Example #1
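These excerpts come from the H2O (classic) Python 2 test suite, with class and import boilerplate trimmed. The calls below assume harness imports roughly along these lines (module aliases inferred from usage here, not verified against any particular checkout; localhost, SEED, paramDict, and exprList are module globals set up elsewhere in each test file):

    import sys, time, copy, re, random
    import h2o, h2o_cmd, h2o_glm, h2o_rf
    import h2o_browse as h2b
    import h2o_import as h2i
    import h2o_exec as h2e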
    def test_many_cols_and_values_with_syn(self):
        SEED = random.randint(0, sys.maxint)
        print "\nUsing random seed:", SEED
        # SEED =
        random.seed(SEED)
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 10, 'cA', 5),
            (100, 1000, 'cB', 5),
            # (100, 900, 'cC', 5),
            # (100, 500, 'cD', 5),
            # (100, 100, 'cE', 5),
            ]
        
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            for sel in range(48): # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

                selKey2 = key2 + "_" + str(sel)
                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=5)
                print csvFilename, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
                print "\n" + csvFilename

                if not h2o.browse_disable:
                    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                    time.sleep(3)
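write_syn_dataset is defined per test file and is not part of this excerpt. A minimal sketch consistent with the calls above, assuming comma-separated random integer values with a 0/1 response appended (the real helpers vary, and sel picks among data-generation cases):

    import random

    def write_syn_dataset(csvPathname, rowCount, colCount, SEED, sel=None):
        # hypothetical stand-in; sel is accepted but unused in this sketch
        r = random.Random(SEED)
        dsf = open(csvPathname, 'w')
        for i in range(rowCount):
            rowData = [str(r.randint(0, 9)) for j in range(colCount)]
            rowData.append(str(r.randint(0, 1)))  # simple output column
            dsf.write(','.join(rowData) + '\n')
        dsf.close()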
Example #2
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse 
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = csvFilename

            # creates csvFilename and csvFilename.hex  keys
            parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRF(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #3
    def test_parse_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 60),
            (100, 6000, 'cB', 60),
            (100, 7000, 'cC', 60),
            (100, 8000, 'cD', 60),
            (100, 8200, 'cE', 60),
            (100, 8500, 'cF', 60),
            (100, 9000, 'cG', 60),
            (100, 10000, 'cI', 60),
            (100, 11000, 'cH', 60),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #4
    def test_many_cols_with_syn(self):
        ### h2b.browseTheCloud()

        csvFilename = "logreg_trisum_int_cat_10000x10.csv"
        csvPathname = "smalldata/logreg/" + csvFilename
        key2 = csvFilename + ".hex"

        parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        paramDict = define_params()
        paramDict2 = {}
        for k in paramDict:
            # sometimes we have a list to pick from in the value. now it's just list of 1.
            paramDict2[k] = paramDict[k][0]

        y = 10
        # FIX! what should we have for case? 1 should be okay because we have 1's in output col
        kwargs = {'y': y, 'max_iter': 50}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
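define_params() is also per-file boilerplate that the excerpt omits. The loop above treats its return value as a dict mapping each GLM parameter name to a list of candidate values, taking the first of each; a hedged sketch with assumed parameter names:

    def define_params():
        # hypothetical: every value is a one-element list to pick from
        paramDict = {
            'family': ['binomial'],
            'lambda': [1e-4],
            'alpha': [0.5],
            'n_folds': [2],
        }
        return paramDict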
Example #5
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ["covtype.data"]
        else:
            csvFilenameList = [
                "covtype200x.data",
                "covtype200x.data",
                "covtype.data",
                "covtype.data",
                "covtype20x.data",
                "covtype20x.data",
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = "/home/0xdiag/datasets/standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm["GLMModel"]
            coefficients = GLMModel["coefficients"]
            validationsList = GLMModel["validations"]
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write(".")
            sys.stdout.flush()
Example #6
    def test_parse_many_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 60),
            (100, 6000, 'cB', 60),
            (100, 7000, 'cC', 60),
            (100, 8000, 'cD', 60),
            (100, 8200, 'cE', 60),
            (100, 8500, 'cF', 60),
            (100, 9000, 'cG', 60),
            (100, 10000, 'cI', 60),
            (100, 11000, 'cH', 60),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #7
    def test_GLM2_many_cols_tridist(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 20, 'cB', 300),
            (10000, 30, 'cC', 300),
            (10000, 40, 'cD', 300),
            (10000, 50, 'cE', 300),
            (10000, 60, 'cF', 300),
            (10000, 70, 'cG', 300),
            (10000, 80, 'cH', 300),
            (10000, 90, 'cI', 300),
            (10000, 100, 'cJ', 300),
            (10000, 200, 'cK', 300),
            (10000, 300, 'cL', 300),
            (10000, 400, 'cM', 300),
            (10000, 500, 'cN', 300),
            (10000, 600, 'cO', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            print "\nParse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]

            y = colCount
            kwargs = {'response': y}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 'C9', **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #8
    def test_parse_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 10),
            (100, 6000, 'cB', 10),
            (100, 7000, 'cC', 10),
            (100, 8000, 'cD', 10),
            (100, 8200, 'cE', 10),
            (100, 8500, 'cF', 10),
            (100, 9000, 'cG', 10),
            (100, 10000, 'cI', 10),
            (100, 11000, 'cH', 10),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=60)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #9
    def test_rf_multinomial_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_multinomial.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 400
        colCount = 7

        for trial in range (5):
            write_syn_dataset(csvPathname, totalRows, colCount, headerData)
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hexKey = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
            print "trial #", trial, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            modelKey = rfView['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
                vactual=colCount+1, vpredict=1, expectedAuc=0.5, doAUC=False)

            h2b.browseJsonHistoryAsUrlLastMatch("RF")
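This test's write_syn_dataset takes a header string rather than a seed, so it differs from the synthetic-data helper sketched earlier. A hedged variant matching the 9-column header and the vactual=colCount+1 scoring column used above:

    import random

    def write_syn_dataset(csvPathname, rowCount, colCount, headerData):
        # hypothetical variant: header row, an ID column, colCount random
        # features, and a small multinomial class in the last column
        dsf = open(csvPathname, 'w')
        dsf.write(headerData + '\n')
        for i in range(rowCount):
            rowData = [str(i)]
            rowData += [str(random.randint(0, 9)) for j in range(colCount)]
            rowData.append(str(random.randint(0, 3)))  # multinomial class
            dsf.write(','.join(rowData) + '\n')
        dsf.close()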
Example #10
    def test_many_cols_and_values_with_syn(self):
        SEED = random.randint(0, sys.maxint)
        print "\nUsing random seed:", SEED
        # SEED =
        random.seed(SEED)
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 10, "cA", 5),
            (100, 1000, "cB", 5),
            # (100, 900, 'cC', 5),
            # (100, 500, 'cD', 5),
            # (100, 100, 'cE', 5),
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            for sel in range(48):  # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename

                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

                selKey2 = key2 + "_" + str(sel)
                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=5)
                print csvFilename, "parse time:", parseKey["response"]["time"]
                print "Parse result['destination_key']:", parseKey["destination_key"]
                inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
                print "\n" + csvFilename

                if not h2o.browse_disable:
                    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                    time.sleep(3)
Example #11
    def test_rf_kddcup_1999(self):
        # since we'll be waiting, pop a browser
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        csvFilename = 'kddcup_1999.data.gz'

        print "Want to see that I get similar results when using H2O RF defaults (no params to json) " +\
            "compared to running with the parameters specified and matching the browser RF query defaults. " +\
            "Also run the param for full scoring vs OOBE scoring."

        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             timeoutSecs=300)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        for trials in range(4):
            print "\n" + csvFilename, "Trial #", trials
            start = time.time()

            kwargs = {
                'response_variable': 'classifier',
                'ntree': 200,
                'gini': 1,
                'class_weights': None,
                'stratify': 0,
                # 'features': None,
                'features': 7,
                'ignore': None,
                'sample': 67,
                'bin_limit': 1024,
                'depth': 2147483647,
                'seed': 784834182943470027,
                'parallel': 1,
                'exclusive_split_limit': None,
            }

            if trials == 0:
                kwargs = {}
            elif trials == 1:
                kwargs['out_of_bag_error_estimate'] = None
            elif trials == 2:
                kwargs['out_of_bag_error_estimate'] = 0
            elif trials == 3:
                kwargs['out_of_bag_error_estimate'] = 1

            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=50,
                                       parseKey=parseKey,
                                       timeoutSecs=300,
                                       retryDelaySecs=1.0,
                                       **kwargs)
            print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
Example #12
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse 
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = csvFilename

            # creates csvFilename and csvFilename.hex  keys
            parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #13
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
                ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
                ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #14
    def test_parse_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, "cA", 10),
            (100, 6000, "cB", 10),
            (100, 7000, "cC", 10),
            (100, 8000, "cD", 10),
            (100, 8200, "cE", 10),
            (100, 8500, "cF", 10),
            (100, 9000, "cG", 10),
            (100, 10000, "cI", 10),
            (100, 11000, "cH", 10),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseKey["destination_key"], timeoutSecs=60)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #15
    def test_import_multi_syn_datasets(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)

        print "This imports a folder of csv files, i.e. points to syn_datasets with no regex"
        print "Doesn't put anything in syn_datasets. When run with the import folder redirected"
        print "to S3, there is a syn_datasets with 100 files"
        print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?"
        timeoutSecs = 500
        if h2o.nodes[0].redirect_import_folder_to_s3_path:
            csvFilenameAll = [
                # FIX! ..just folder doesn't appear to work. add regex
                # need a destination_key...h2o seems to use the regex if I don't provide one
                ### "syn_datasets/*",
                "syn_datasets/*_10000x200*",
            ]
        else:
            csvFilenameAll = [
                # FIX! ..just folder doesn't appear to work. add regex
                # need a destination_key...h2o seems to use the regex if I don't provide one
                ### "syn_datasets/*",
                "syn_datasets/*",
            ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2="syn_datasets.hex",
                                                 timeoutSecs=500)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "from all files num_rows:", "{:,}".format(inspect['num_rows']), \
                "num_cols:", "{:,}".format(inspect['num_cols'])

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
            start = time.time()
            RFview = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

            # so we can see!
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            time.sleep(5)
Example #16
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cI', 5),
            (100, 5000, 'cA', 5),
            (100, 6000, 'cB', 5),
            (100, 7000, 'cC', 5),
            (100, 8000, 'cD', 5),
            (100, 8200, 'cE', 5),
            (100, 8500, 'cF', 5),
            (100, 9000, 'cG', 5),
            (100, 11000, 'cH', 5),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #17
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cI', 5),
            (100, 11000, 'cH', 5),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=120)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #18
    def test_many_cols_01(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 5),
            (100, 10000, 'cI', 5),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=120, 
                doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=120)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #19
    def test_GLM_with_logit_result_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 5, 'cA', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, \
                "using random coefficients and intercept and logit eqn. for output"
            (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
            print coefficients, intercept
            write_syn_dataset(csvPathname, rowCount, colCount, coefficients,
                              intercept, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 60,
                'lambda': 1e-4,
                'alpha': 0,
                'weight': 1.0,
                'n_folds': 3,
                'beta_epsilon': 1e-4,
                'thresholds': 0.5,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)
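gen_rand_equation, and the matching write_syn_dataset variant that pushes each row through the logit to produce the 0/1 output column, are not shown in the excerpt. A minimal sketch of the equation generator, assuming uniform random coefficients:

    import random

    def gen_rand_equation(colCount, SEED):
        # hypothetical: one coefficient per input column plus an intercept
        r = random.Random(SEED)
        coefficients = [r.uniform(-1, 1) for j in range(colCount)]
        intercept = r.uniform(-1, 1)
        return (coefficients, intercept)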
Example #20
    def test_exec2_int2cat_nested(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000,  10, 'cA', 100),
            (1000,  20, 'cB', 100),
            (1000,  30, 'cC', 100),
            (1000,  40, 'cD', 100),
            (1000,  10, 'cE', 100),
            (1000,  20, 'cF', 100),
            (1000,  30, 'cG', 100),
            (1000,  40, 'cH', 100),
            ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);',
                '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);'
                ]

        exprList = [
                '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);'
                ]

        exprList = [
                '<keyX>[,2] = factor(<keyX>[,2])',
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the exec commands across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount, 
                timeoutSecs=30, incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Example #21
    def test_many_cols_real(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 100, 'cA', 300),
            (1000, 200, 'cB', 300),
            (1000, 300, 'cC', 300),
            (1000, 400, 'cD', 300),
            (1000, 500, 'cE', 300),
            (1000, 1000, 'cJ', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 50,
                'case': '1',
                'case_mode': '=',
                'lambda': 1e-4,
                'alpha': 0.6
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         offset=100,
                                         view=100)
Example #22
    def test_GLM_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000,  10, 'cA.hex', 100),
            (10000,  20, 'cB.hex', 200),
            (10000,  30, 'cC.hex', 300),
            (10000,  40, 'cD.hex', 400),
            (10000,  50, 'cE.hex', 500),
            ]

        ### h2b.browseTheCloud()

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
                ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
            ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount, 
                timeoutSecs=90, incrementingResult=False)
            print "\nexec colResultList", colResultList

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # since we add the output twice, it's no longer colCount-1
            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # only col y-1 (next to last) doesn't get renamed in coefficients
            # due to enum/categorical expansion
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(3)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Example #23
    def test_cs_training(self):
        h2o_cmd.runRF(trees=100,
                      depth=100,
                      csvPathname=h2o.find_file('smalldata/kaggle/creditsample-training.csv.gz'),
                      timeoutSecs=300,
                      response_variable=1)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
Example #24
    def test_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 10, 'cA', 100),
            (1000, 20, 'cB', 100),
            (1000, 30, 'cC', 100),
            (1000, 40, 'cD', 100),
            (1000, 10, 'cE', 100),
            (1000, 20, 'cF', 100),
            (1000, 30, 'cG', 100),
            (1000, 40, 'cH', 100),
        ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        if 1 == 0:
            exprList = [
                '<keyX> = colSwap(<keyX>,<col1>,' + 'colSwap(<keyX>,<col2>,' +
                'colSwap(<keyX>,<col1>,' + 'colSwap(<keyX>,<col2>,' +
                '<keyX>[0]' + '))))',
            ]
        else:
            exprList = [
                '<keyX> = colSwap(<keyX>,<col1>,' + '<keyX>[0]' + ')',
            ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                key2,
                maxCol=colCount,
                timeoutSecs=30,
                incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Example #25
    def test_GLM_with_logit_result_1_NA(self):
        print "Put NAs in col 1...all of col 1 is empty"
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 5, 'cA', 300), 
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, \
                "using random coefficients and intercept and logit eqn. for output"
            (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
            print coefficients, intercept
            write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEEDPERFILE)


            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            print "Don't specify x to GLM, even though there are NA's on col 1"
            # FIX! should check inspect missing
            n = inspect['cols'][1]['num_missing_values']
            print "num_missing_values in col 1:", n
            self.assertEqual(n, rowCount, \
                msg="Expect col 1 to have num_missing_values: %d equal to rowCount: %d" % (n, rowCount))
            

            kwargs = {
                    'y': y, 
                    'max_iter': 60, 
                    'lambda': 1e-4,
                    'alpha': 0,
                    'weight': 1.0,
                    'n_folds': 3,
                    'beta_epsilon': 1e-4,
                    'thresholds': 0.5,
                    }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)
Example #26
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)

            # different when n_folds cross-validation is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)

            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #27
    def test_json_browse_both_exec(self):
        lenNodes = len(h2o.nodes)
        csvPathname = 'standard/covtype.data'
        hex_key = 'c.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "\nParse key is:", parseResult['destination_key']

        ## h2b.browseTheCloud()
        # for trial in range(53):
        trial = 0
        while (trial < 100):
            for exprTemplate in exprList:
                trial = trial + 1
                n = trial
                colX = random.randint(1,54)
                row = random.randint(1,400000)

                execExpr = exprTemplate
                execExpr = re.sub('<col1>',str(colX),execExpr)
                execExpr = re.sub('<col2>',str(colX+1),execExpr)
                execExpr = re.sub('<n>',str(n),execExpr)
                execExpr = re.sub('<row>',str(row),execExpr)
                execExpr = re.sub('<keyX>',str(hex_key),execExpr)

                # pick a random node to execute it on
                randNode = random.randint(0,lenNodes-1)
                print "\nexecExpr:", execExpr, "on node", randNode

                start = time.time()
                resultExec = h2o_cmd.runExec(node=h2o.nodes[randNode], 
                    execExpr=execExpr, timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(resultExec))
                # print(h2o.dump_json(resultExec))

                # FIX! race conditions. If json is done, does that mean you can inspect it??
                # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
                if trial > 1:
                    inspectMe = random.choice(inspectList)
                    resultInspect = h2o.nodes[0].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[1].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[2].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
                # might be a bug?

                # WARNING! we can't browse the Exec url history, since that will 
                # cause the Exec to execute again thru the browser..i.e. it has side effects
                # just look at the last inspect, which should be the resultInspect!
                # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                h2b.browseJsonHistoryAsUrlLastMatch("Exec")
                h2o.check_sandbox_for_errors()
                print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
                print "Trial #", trial, "completed\n"
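Example #27 reads module-level exprList and inspectList globals that the excerpt omits. A hedged sketch consistent with the placeholders the loop substitutes and with the Result1/Result2 keys the later inspects expect (the actual expressions in the test file differ):

    # hypothetical Exec templates; the loop rewrites the <...> placeholders
    exprList = [
        'Result1 = <keyX>[<col1>]',
        'Result2 = <keyX>[<col1>] + <keyX>[<col2>]',
        'Result<n> = <keyX>[<col2>]',
    ]
    inspectList = ['Result1', 'Result2']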
Example #28
    def test_many_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cI', 5),
            (100, 5000, 'cA', 5),
            (100, 6000, 'cB', 5),
            (100, 7000, 'cC', 5),
            (100, 8000, 'cD', 5),
            (100, 8200, 'cE', 5),
            (100, 8500, 'cF', 5),
            (100, 9000, 'cG', 5),
            (100, 11000, 'cH', 5),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=120,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         offset=100,
                                         view=100)
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         offset=99,
                                         view=89)
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         offset=-1,
                                         view=53)
Example #29
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cI', 5),
            (100, 5000, 'cA', 5),
            (100, 6000, 'cB', 5),
            (100, 7000, 'cC', 5),
            (100, 8000, 'cD', 5),
            (100, 8200, 'cE', 5),
            (100, 8500, 'cF', 5),
            (100, 9000, 'cG', 5),
            (100, 11000, 'cH', 5),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         offset=100,
                                         view=100)
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         offset=99,
                                         view=89)
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         offset=-1,
                                         view=53)
Example #30
    def test_from_import(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'

        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=parseKey,
                timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example n. 31
    def test_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11000, 0, 'cA', 180),
            (100, 10000, 1, 'cB', 180),
            (100, 9000, 0, 'cC', 180),
            (100, 8000, 1, 'cD', 180),
            (100, 7000, 0, 'cE', 180),
            (100, 6000, 1, 'cF', 180),
            (100, 5000, 0, 'cG', 180),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        # it's interesting to force the first enum row to be used as header or not
        # with many cols, we tend to hit limits about stuff fitting in a chunk (header or data)
        for (rowCount, colCount, header, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, header, SEED)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           header=header,
                                           hex_key=hex_key,
                                           timeoutSecs=60)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         offset=100,
                                         view=100)
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         offset=99,
                                         view=89)
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         offset=-1,
                                         view=53)
Example n. 33
    def test_exec2_int2cat_nested(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 10, 'cA', 100),
            (1000, 20, 'cB', 100),
            (1000, 30, 'cC', 100),
            (1000, 40, 'cD', 100),
            (1000, 10, 'cE', 100),
            (1000, 20, 'cF', 100),
            (1000, 30, 'cG', 100),
            (1000, 40, 'cH', 100),
        ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>[,<col2>] = <keyX>[,<col1>];',
                '<keyX>[,<col1>] = <keyX>[,1];',
                '<keyX>[,1] = <keyX>[,<col2>];',
                '<keyX>[,<col2>] = <keyX>[,<col1>];',
                '<keyX>[,<col1>] = <keyX>[,1];',
                '<keyX>[,1] = <keyX>[,<col2>];'
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the exec commands across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                hex_key,
                maxCol=colCount,
                timeoutSecs=30,
                incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Example n. 34
    def test_import_multi_syn_datasets(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'

        print "This imports a folder of csv files..i.e points to syn_datasets with no regex"
        print "Doesn't put anything in syn_datasets. When run with import folder redirected"
        print "to import S3, there is a syn_datasets with 100 files"
        print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?" 
        timeoutSecs = 500
        if h2o.nodes[0].redirect_import_folder_to_s3_path:
            csvFilenameAll = [
                # FIX! ..just folder doesn't appear to work. add regex
                # need a destination_key...h2o seems to use the regex if I don't provide one
                ### "syn_datasets/*", 
                "syn_datasets/*_10000x200*", 
                ]
        else:
            csvFilenameAll = [
                # FIX! ..just folder doesn't appear to work. add regex
                # need a destination_key...h2o seems to use the regex if I don't provide one
                ### "syn_datasets/*", 
                "syn_datasets/*", 
                ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2="syn_datasets.hex",
                timeoutSecs=500)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "from all files num_rows:", "{:,}".format(inspect['num_rows']), \
                "num_cols:", "{:,}".format(inspect['num_cols'])

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
            start = time.time()
            RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # so we can see!
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            time.sleep(5)
Example n. 35
    def test_rf_kddcup_1999(self):
        # since we'll be waiting, pop a browser
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        csvFilename = 'kddcup_1999.data.gz'

        print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\
            "compared to running with the parameters specified and matching the browser RF query defaults. " +\
            "Also run the param for full scoring vs OOBE scoring."

        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None,parseKey['destination_key'])

        for trials in range(4):
            print "\n" + csvFilename, "Trial #", trials
            start = time.time()

            kwargs = {
                'response_variable': 'classifier',
                'ntree': 200,
                'gini': 1,
                'class_weights': None,
                'stratify': 0,
                # 'features': None,
                'features': 7,
                'ignore': None,
                'sample': 67,
                'bin_limit': 1024,
                'depth': 2147483647,
                'seed': 784834182943470027,
                'parallel': 1,
                'exclusive_split_limit': None,
                }

            if trials == 0:
                kwargs = {}
            elif trials == 1:
                kwargs['out_of_bag_error_estimate'] = None
            elif trials == 2:
                kwargs['out_of_bag_error_estimate'] = 0
            elif trials == 3:
                kwargs['out_of_bag_error_estimate'] = 1

            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=50,parseKey=parseKey, 
                timeoutSecs=300, retryDelaySecs=1.0, **kwargs)
            print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
Example n. 36
    def test_GLM_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (10000,  100, 'cA', 100),
            (10000,  200, 'cB', 200),
            (10000,  300, 'cC', 300),
            ]

        ### h2b.browseTheCloud()

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y, 
                'max_iter': 50, 
                'case': 1,
                'family': 'binomial',
                'lambda': 0,
                'alpha': 0,
                'weight': 1.0,
                'thresholds': 0.5,
                'n_folds': 2,
                'beta_epsilon':1.0E-4,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(10)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(10)
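
Note: the write_syn_dataset used by this test also takes translateList, presumably to map random integers to letters so every predictor column parses as an enum. A sketch under that assumption (the 0/1 binomial response in the last column is also an assumption):

import random

# Hypothetical sketch: draw each predictor cell from translateList so it
# parses as an enum, and append a 0/1 response as the last column.
def write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList):
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        rowData = [r.choice(translateList) for j in range(colCount)]
        rowData.append(str(r.randint(0, 1)))
        dsf.write(",".join(rowData) + "\n")
    dsf.close()
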
Example n. 37
    def test_GLM_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (10000,  100, 'cA', 100),
            (10000,  200, 'cB', 200),
            (10000,  300, 'cC', 300),
            ]

        ### h2b.browseTheCloud()

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y, 
                'max_iter': 50, 
                'case': 1,
                'family': 'binomial',
                'lambda': 0,
                'alpha': 0,
                'weight': 1.0,
                'thresholds': 0.5,
                'n_folds': 2,
                'beta_eps':1.0E-4,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(10)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(10)
Example n. 38
    def test_GLM_many_cols_tridist(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000,  10, 'cA', 300),
            (10000,  20, 'cB', 300),
            (10000,  30, 'cC', 300),
            (10000,  40, 'cD', 300),
            (10000,  50, 'cE', 300),
            (10000,  60, 'cF', 300),
            (10000,  70, 'cG', 300),
            (10000,  80, 'cH', 300),
            (10000,  90, 'cI', 300),
            (10000, 100, 'cJ', 300),
            (10000, 200, 'cK', 300),
            (10000, 300, 'cL', 300),
            (10000, 400, 'cM', 300),
            (10000, 500, 'cN', 300),
            (10000, 600, 'cO', 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "\nParse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]

            y = colCount
            kwargs = {'y': y}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example n. 39
    def test_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000,  10, 'cA', 100),
            (1000,  20, 'cB', 100),
            (1000,  30, 'cC', 100),
            (1000,  40, 'cD', 100),
            (1000,  10, 'cE', 100),
            (1000,  20, 'cF', 100),
            (1000,  30, 'cG', 100),
            (1000,  40, 'cH', 100),
            ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        if 1==0:
            exprList = [
                    '<keyX> = colSwap(<keyX>,<col1>,' +
                                 'colSwap(<keyX>,<col2>,' +
                                 'colSwap(<keyX>,<col1>,' +
                                 'colSwap(<keyX>,<col2>,' +
                                 '<keyX>[0]' +
                                 '))))',
                ]
        else:
            exprList = [
                    '<keyX> = colSwap(<keyX>,<col1>,' + 
                                 '<keyX>[0]' +
                                 ')',
                ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount, 
                timeoutSecs=30, incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Example n. 40
    def test_parse_rand_utf8_angle_start(self):

        h2b.browseTheCloud()
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 1, 'cA', 120),
            (1000, 1, 'cG', 120),
            (1000, 1, 'cH', 120),
            ]

        print "What about messages to log (INFO) about unmatched quotes (before eol)"
        # got this ..trying to avoid for now
        # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

        print "what we used"
        print "ordinalChoices:", ordinalChoices
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
        
            print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']

            # Don't check for now..going to get empty rows
            # self.assertEqual(numRows, rowCount, msg='Wrong numRows likely due to unmatched " row going to NA: %s %s' % (numRows, rowCount))
            numCols = inspect['numCols']

            # because of our double quote termination hack above
            if DOUBLE_QUOTE:
                self.assertTrue((numCols==colCount or numCols==colCount+1), msg='Wrong numCols: %s %s' % (numCols, colCount))
            else:
                self.assertTrue(numCols==colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))

            for k in range(colCount):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))

                stype = inspect['cols'][k]['type']
                self.assertEqual("Enum", stype, msg='col %s type %s should be Enum' % (k, stype))

        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(5)
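
Note: ordinalChoices and DOUBLE_QUOTE live at the test's module scope and aren't shown here. One plausible sketch of the generator they imply, writing one random UTF-8 character per cell and optionally double-quoting it (the code-point pool is an assumption):

# -*- coding: utf-8 -*-
import random

DOUBLE_QUOTE = True                  # assumed module-level flag
ordinalChoices = range(0x41, 0x100)  # assumed code-point pool

# Hypothetical sketch of a random-UTF-8 dataset writer for this test.
def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        rowData = []
        for j in range(colCount):
            u = unichr(r.choice(ordinalChoices))
            if DOUBLE_QUOTE:
                u = u'"' + u + u'"'
            rowData.append(u)
        dsf.write(u",".join(rowData).encode('utf-8') + "\n")
    dsf.close()
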
Example n. 41
    def test_B_putfile_files(self):
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameList = [
            ("covtype.data", 1),
            ("covtype20x.data", 1),
            # ("covtype200x.data", None),
            # ("a5m.csv", None),
            # ("a10m.csv", None),
            # ("a100m.csv", None),
            # ("a200m.csv", None),
            # ("a400m.csv", None),
            # ("a600m.csv", None),
            # ("100million_rows.csv,  None"),
            # ("200million_rows.csv", None),
            # ("billion_rows.csv.gz", 1),
            # memory issue on one machine. no RF
            # ("new-poker-hand.full.311M.txt.gz", None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, trees) in csvFilenameList:
            csvPathname = h2o.find_file('/home/0xdiag/datasets/' + csvFilename)

            # creates csvFilename and csvFilename.hex  keys
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=csvFilename, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseKey=parseKey,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example n. 42
    def test_GLM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 5, 'cA', 300), 
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, \
                "using random coefficients and intercept and logit eqn. for output"
            (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
            print coefficients, intercept
            write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEEDPERFILE)


            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {'y': y, 'max_iter': 60, 
                    'lambda': 1e-4,
                    'alpha': 0,
                    'weight': 1.0,
                    # what about these?
                    # 'link': [None, 'logit','identity', 'log', 'inverse'],
                    'n_folds': 3,
                    'beta_epsilon': 1e-4,
                    'thresholds': 0.5,
                    }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)
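
Note: gen_rand_equation and this write_syn_dataset variant label each row through a random logit equation, but neither helper appears in the listing. A rough sketch (the uniform(-1, 1) ranges and the 0.5 probability cutoff are assumptions):

import math
import random

# Hypothetical sketch of the helpers test_GLM_many_cols assumes.
def gen_rand_equation(colCount, SEED):
    r = random.Random(SEED)
    coefficients = [r.uniform(-1, 1) for j in range(colCount)]
    intercept = r.uniform(-1, 1)
    return (coefficients, intercept)

def write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEED):
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        xs = [r.uniform(-1, 1) for j in range(colCount)]
        z = intercept + sum([c * x for (c, x) in zip(coefficients, xs)])
        # label through the logit: p = 1/(1+exp(-z)), threshold at 0.5
        y = 1 if 1.0 / (1.0 + math.exp(-z)) > 0.5 else 0
        dsf.write(",".join(map(str, xs)) + "," + str(y) + "\n")
    dsf.close()
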
Example n. 43
    def test_many_cols_long_enums(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            SEPARATOR = ord(',')
            parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=10,
                header=0,
                separator=SEPARATOR
            )  # don't force header..we have NAs in the rows, and NAs mess up headers
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
Example n. 44
    def test_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (100, 11000, 0, 'cA', 180),
            # (100, 10000, 1, 'cB', 180),
            # (100, 8000, 1, 'cD', 180),
            # (100, 7000, 0, 'cE', 180),
            # (100, 6000, 1, 'cF', 180),
            (100, 1000, 0, 'cH', 120),
            (100, 1000, 1, 'cI', 120),
            (100, 2000, 1, 'cI', 120),
            (100, 3000, 1, 'cI', 120),
            (100, 4000, 1, 'cI', 120),
            (100, 5000, 0, 'cG', 180),
            (100, 9000, 0, 'cC', 180),
            (100, 10000, 1, 'cB', 180),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        # it's interesting to force the first enum row to be used as header or not
        # with many cols, we tend to hit limits about stuff fitting in a chunk (header or data)
        for (rowCount, colCount, header, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, header, SEED)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=header, 
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=100, view=100)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=99, view=89)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=-1, view=53)
Example n. 45
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs(
            path='/datasets', 
            schema='maprfs')

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(
                csvFilename=csvFilename, 
                path='/datasets', 
                schema='maprfs', 
                timeoutSecs=1000)

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example n. 46
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
            (5, 100, 'cA', 5),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            SEPARATOR = ord(',')
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10,
                                         separator=SEPARATOR,
                                         header=1)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
Example n. 47
    def test_GLM_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (10000,  100, 'cA', 100),
            (10000,  200, 'cB', 200),
            (10000,  300, 'cC', 300),
            ]

        ### h2b.browseTheCloud()

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]

            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # only col Y-1 (next to last) doesn't get renamed in coefficients due to enum/categorical expansion
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(15)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(15)
Example n. 48
def pollWaitJobs(pattern=None, timeoutSecs=30, pollTimeoutSecs=30, retryDelaySecs=5, benchmarkLogging=None):
    anyBusy = True
    waitTime = 0
    while anyBusy:
        # timeout checking has to move in here now! just count loops
        anyBusy = False
        a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs)
        ## print "jobs_admin():", h2o.dump_json(a)
        jobs = a["jobs"]
        patternKeys = []
        for j in jobs:
            ### h2o.verboseprint(j)
            # save the destination keys for any GLMModel in progress
            if pattern and pattern in j["destination_key"]:
                patternKeys.append(j["destination_key"])

            if j["end_time"] == "":
                anyBusy = True
                h2o.verboseprint(
                    "waiting",
                    waitTime,
                    "secs, still not done - ",
                    "destination_key:",
                    j["destination_key"],
                    "progress:",
                    j["progress"],
                    "cancelled:",
                    j["cancelled"],
                    "end_time:",
                    j["end_time"],
                )

        h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if anyBusy and waitTime > timeoutSecs:
            print h2o.dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds")

        sys.stdout.write(".")
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)
    return patternKeys
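
Note: a typical call site kicks off a long-running request and then blocks on pollWaitJobs; for example (the 'GLMModel' pattern and the logging list are illustrative only):

# Wait up to 5 minutes for all queued jobs, collecting the destination
# keys of any in-flight GLM models, saving cpu/disk stats while polling.
patternKeys = pollWaitJobs(pattern='GLMModel',
                           timeoutSecs=300,
                           retryDelaySecs=5,
                           benchmarkLogging=['cpu', 'disk'])
print "GLM keys seen while polling:", patternKeys
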
Example n. 49
    def test_rf_multinomial_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_multinomial.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 400
        colCount = 7

        for trial in range(5):
            write_syn_dataset(csvPathname, totalRows, colCount, headerData)
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hexKey = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey,
                                           doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   timeoutSecs=15,
                                   pollTimeoutSecs=5,
                                   **kwargs)
            print "trial #", trial, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            modelKey = rfView['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'],
                             modelKey=modelKey,
                             vactual=colCount + 1,
                             vpredict=1,
                             expectedAuc=0.5,
                             doAUC=False)

            h2b.browseJsonHistoryAsUrlLastMatch("RF")
Example n. 50
    def test_rf_from_import_hosts(self):

        # just do the import folder once
        timeoutSecs = 500
        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        csvFilenameList = [
            "billion_rows.csv.gz",
            # "covtype20x.data",
        ]

        importFolderPath = "standard"
        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=500)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            RFview = h2o_cmd.runRF(trees=1,
                                   depth=25,
                                   parseResult=parseResult,
                                   timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush()
Example n. 51
    def test_rf_tnc3_fvec(self):
        h2o.beta_features = True
        csvPathname = 'tnc3.csv'
        print "\n" + csvPathname
        hex_key = "tnc3.hex"
        ### h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put', 
            timeoutSecs=10, retryDelaySecs=0.25, header=1)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

        if 1==1:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25,
                ignored_cols_by_name="boat,body")

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)

        #******************
        if 1==0:
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10, retryDelaySecs=0.25)
            print "\ncolResultList after char swap", colResultList

        if 1==1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)
Example n. 52
def pollWaitJobs(pattern=None,
                 timeoutSecs=30,
                 pollTimeoutSecs=30,
                 retryDelaySecs=5,
                 benchmarkLogging=None):
    anyBusy = True
    waitTime = 0
    while (anyBusy):
        # timeout checking has to move in here now! just count loops
        anyBusy = False
        a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs)
        ## print "jobs_admin():", h2o.dump_json(a)
        jobs = a['jobs']
        patternKeys = []
        for j in jobs:
            ### h2o.verboseprint(j)
            # save the destination keys for any GLMModel in progress
            if pattern and pattern in j['destination_key']:
                patternKeys.append(j['destination_key'])

            if j['end_time'] == '':
                anyBusy = True
                h2o.verboseprint("waiting", waitTime, "secs, still not done - ",\
                    "destination_key:", j['destination_key'], \
                    "progress:",  j['progress'], \
                    "cancelled:", j['cancelled'],\
                    "end_time:",  j['end_time'])

        h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if (anyBusy and waitTime > timeoutSecs):
            print h2o.dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after",
                            timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)
    return patternKeys
Example n. 53
    def test_many_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cI', 5),
            (100, 5000, 'cA', 5),
            (100, 6000, 'cB', 5),
            (100, 7000, 'cC', 5),
            (100, 8000, 'cD', 5),
            (100, 8200, 'cE', 5),
            (100, 8500, 'cF', 5),
            (100, 9000, 'cG', 5),
            (100, 11000, 'cH', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=100, view=100)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=99, view=89)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=-1, view=53)
Example n. 54
    def test_KMeans_twit(self):
        csvFilename = "Twitter2DB.txt"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/' + csvFilename)

        # h2b.browseTheCloud()
        # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        # should check the means?
        # FIX! have to fix these to right answers
        expected = [
            # expected centers are from R. rest is just from h2o
            ([310527.2, 13433.89], 11340, None),
            ([5647967.1, 40487.76], 550, None),
            ([21765291.7, 93129.26], 14, None),
        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {
                'k': 3,
                'max_iter': 50,
                'epsilon': 1e-4,
                'normalize': 0,
                'cols': '0,1',
                'initialization': 'Furthest',
                # 'initialization': 'PlusPlus',
                'destination_key': 'kmeans_dest_key',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310
            }

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            if 1 == 0:
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeans")
                time.sleep(3600)

            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
Example n. 55
    def test_tnc3_ignore(self):
        csvFilename = 'tnc3.csv'
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1)
        print "Parse result['Key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if 1==1:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if 1==0:
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if 1==1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example n. 56
    def test_GLM_twovalues(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # H2O might not do whitespace stripping on numbers correctly, when , is {SEP}
        # GLM will auto expand categoricals..so if we have more coefficients than expected
        # that means it didn't parse right
        # mix in space/tab combos
        # just done like this for readability
        rowDataTrueRaw = \
            "<sp>1,\
            0<sp>,\
            <tab>65,\
            1<tab>,\
            <sp><tab>2,\
            1<sp><tab>,\
            <tab><sp>1,\
            4<tab><sp>,\
            <tab><tab>1,\
            4<tab><tab>,\
            <sp><sp>1,\
            4<sp><sp>"

        rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw)
        rowDataTrue = re.sub("<tab>","  ", rowDataTrue)

        rowDataFalse = \
            "0,\
            1,\
            0,\
            -1,\
            -2,\
            -1,\
            -1,\
            -4,\
            -1,\
            -4,\
            -1,\
            -4"

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            key = csvFilename + "_" + str(trial)
            kwargs = {'case': case, 'y': 10, 'family': 'binomial', 'alpha': 0, 'beta_eps': 0.0002}

            # default takes 39 iterations? play with alpha/beta
            glm = h2o_cmd.runGLM(csvPathname=csvPathname, key=key, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)

            # check that the number of coefficient entries matches the expected coeffNum (includes intercept)
            actualCoeffNum = len(glm['GLMModel']['coefficients'])
            if (actualCoeffNum != coeffNum):
                raise Exception("Should be %s coefficients in result, got %s" % (coeffNum, actualCoeffNum))

            print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            h2o.check_sandbox_for_errors()
            trial += 1
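
Note: write_syn_dataset here receives both row templates plus the two class labels; a minimal sketch, assuming it simply alternates the templates and appends the label:

# Hypothetical sketch: alternate the fixed true/false row templates and
# append the class label, giving GLM a cleanly separable 2-class file.
def write_syn_dataset(csvPathname, rowCount, rowDataTrue, rowDataFalse,
                      outputTrue, outputFalse):
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        if i % 2 == 0:
            dsf.write(rowDataTrue + "," + outputTrue + "\n")
        else:
            dsf.write(rowDataFalse + "," + outputFalse + "\n")
    dsf.close()
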
Example n. 57
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
            ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
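            # (a dict entry like {'lambda': 1e-4} still forwards fine through
            # **kwargs, which is why GLM params are built as a dict here)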
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=2000,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations,
                                          validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients,
                                          coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush()
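
A note on the comparison above: h2o_glm.compareToFirstGlm checks each later
run's numbers against the first run's. A minimal sketch of that pattern (the
helper body and the 10% tolerance are assumptions, not the library code):

    def compare_to_first(key, current, first, tolerance=0.10):
        # relative drift of this run's value against the first run's value
        a = float(current[key])
        b = float(first[key])
        rel = abs(a - b) / max(abs(b), 1e-12)
        if rel > tolerance:
            raise Exception("%s drifted between runs: %s vs %s" % (key, a, b))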
Example no. 58
    def test_GLM_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000, 10, 'cA.hex', 100),
            (10000, 20, 'cB.hex', 200),
            (10000, 30, 'cC.hex', 300),
            (10000, 40, 'cD.hex', 400),
            (10000, 50, 'cE.hex', 500),
        ]

        ### h2b.browseTheCloud()

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
            '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
            ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
        ]
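        # exec_expr_list_across_cols substitutes <keyX>/<col1> per column; e.g.
        # for key2='cA.hex' and column 3 the executed expression is roughly:
        #   cA.hex= colSwap(cA.hex,3,factor(cA.hex[3]))
        # (see the fuller sketch after this example)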

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=90)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                key2,
                maxCol=colCount,
                timeoutSecs=90,
                incrementingResult=False)
            print "\nexec colResultList", colResultList

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # since we add the output twice, it's no longer colCount-1
            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            # only col y-1 (next to last)doesn't get renamed in coefficients
            # due to enum/categorical expansion
            print "y:", y
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(3)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
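
For reference, a minimal sketch of the placeholder expansion that
h2e.exec_expr_list_across_cols appears to perform (inferred from the call
above; the real helper also submits each expression to H2O's exec endpoint,
which this sketch omits):

    def expand_exprs_across_cols(exprList, keyX, maxCol):
        # expand <keyX>/<col1> into one concrete expression per column
        exprs = []
        for col in range(maxCol):
            for template in exprList:
                exprs.append(template.replace('<keyX>', keyX)
                                     .replace('<col1>', str(col)))
        return exprs

    # expand_exprs_across_cols(exprList, 'cA.hex', 3)[2] ->
    # 'cA.hex= colSwap(cA.hex,2,factor(cA.hex[2]))'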