# Example #1
    def test_GLM_gamma_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gamma",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 24
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120,
                                     parseKey=parseKey,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        """Gunzip/shuffle/replicate the hastie data, running kmeans at 1x, 2x and 4x.

        Shuffling checks that row order doesn't affect the result.
        FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        in other tests. (catdata?)
        """
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        kmeans_doit(self, csvFilename, csvPathname, num_rows=1000000, timeoutSecs=60)

        # 1x: plain gunzip of the source
        filename1x = "hastie_1x.data"
        pathname1x = "%s/%s" % (SYNDATASETS_DIR, filename1x)
        h2o_util.file_gunzip(csvPathname, pathname1x)

        # 1x shuffled
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = "%s/%s" % (SYNDATASETS_DIR, filename1xShuf)
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        # 2x: two copies of the shuffled 1x, shuffled again
        filename2x = "hastie_2x.data"
        pathname2x = "%s/%s" % (SYNDATASETS_DIR, filename2x)
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = "%s/%s" % (SYNDATASETS_DIR, filename2xShuf)
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        kmeans_doit(self, filename2xShuf, pathname2xShuf, num_rows=2000000, timeoutSecs=90)

        # 4x: too big to shuffle? cat the shuffled 2x twice
        filename4x = "hastie_4x.data"
        pathname4x = "%s/%s" % (SYNDATASETS_DIR, filename4x)
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        kmeans_doit(self, filename4x, pathname4x, num_rows=4000000, timeoutSecs=120)
# Example #3
    def test_rf_covtype_train_full(self):
        csvFilename = 'train.csv'
        csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename)
        print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex",
                                     header=1,
                                     timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            rfView = h2o_cmd.runRF(csvPathname=csvPathname,
                                   timeoutSecs=timeoutSecs,
                                   **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix'][
                'classification_error']
            self.assertLess(
                classification_error, 0.02,
                "train.csv should have full classification error <0.02")

            print "Trial #", trial, "completed"
# Example #4
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse 
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = h2o.find_dataset(csvFilename)

            # creates csvFilename and csvFilename.hex  keys
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseKey=parseKey,
                    timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
# Example #5
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED =
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + (
             (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) *
             (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs,
                       retryDelaySecs=1,
                       csvPathname=csvPathname,
                       **kwargs)
         elapsed = time.time() - start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
             (elapsed * 100) / timeoutSecs)
# Example #6
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gamma",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 10
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120,
                                     parseKey=parseKey,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'

            print "Trial #", trial, "completed\n"
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'num_cross_validation_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'n_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            
            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
# Example #10
 def test_rf_strata_fail(self):
     """RF on covtype with RANDOM sampling plus an all-undefined strata map (failure repro)."""
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     timeoutSecs = 60
     # exact parameter set that reproduced the strata failure
     kwargs = {
         'response_variable': 54,
         'ntree': 50,
         'features': '',
         'depth': 2147483647,
         'stat_type': 'ENTROPY',
         'ignore': '',
         'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
         'sampling_strategy': 'RANDOM',
         # strata map is deliberately all-undefined to trigger the failure
         'strata_samples':
         'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
         'sample': '67',
         'out_of_bag_error_estimate': 1,
         'model_key': '',
         'bin_limit': 1024,
         'seed': 784834182943470027,
         'parallel': 1,
         'exclusive_split_limit': '',
         'iterative_cm': 1,
         'use_non_local_data': 0,
     }
     h2o_cmd.runRF(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs)
# Example #11
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)

        y = "10"
        x = ""
        kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}
        (modelKey, validations1) = glm_doit(self, csvFilename, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs)

        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_score(self,filename2x, pathname2x, modelKey, thresholds="0.5",
            timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x,pathname2x,pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self,filename4x, pathname4x, modelKey, thresholds="0.5",
                timeoutSecs=60, pollTimeoutSecs=60)
# Example #12
    def test_GLM_poisson_1(self):
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        if (1 == 0):
            print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
            # pythonic!
            x = ",".join(map(str, range(33)))
        else:
            x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'x': x,
            'y': y,
            'family': 'poisson',
            'link': 'log',
            'n_folds': 0,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3
        }

        timeoutSecs = 120
        # L2
        start = time.time()
        kwargs.update({'alpha': 0, 'lambda': 0})
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        print "glm (L2) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # L1
        kwargs.update({'alpha': 1, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': 'binomial',
                'max_iter': 5,
                'case': 1,
                'alpha': 0,
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150,
                              params['n_folds'] * 10 + params['max_iter'] * 10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()

            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
# Example #14
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54, 
                'n_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 15,
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
# Example #15
    def test_glm_covtype_single_cols(self):
        timeoutSecs = 10
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "\n" + csvPathname

        # columns start at 0
        y = "54"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        print "GLM binomial wth 1 X column at a time"
        print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
        for colX in xrange(54):
            if x == "":
                x = str(colX)
            else:
                # x = x + "," + str(colX)
                x = str(colX)

            sys.stdout.write('.')
            sys.stdout.flush()
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
    def test_glm_covtype_single_cols(self):
        timeoutSecs = 10
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "\n" + csvPathname

        # columns start at 0
        y = "54"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        print "GLM binomial wth 1 X column at a time" 
        print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
        for colX in xrange(54):
            if x == "": 
                x = str(colX)
            else:
                # x = x + "," + str(colX)
                x = str(colX)

            sys.stdout.write('.')
            sys.stdout.flush() 
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
# Example #17
    def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        """glm_doit on hastie 1x/2x/4x, shuffling to show row order doesn't matter.

        FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        in other tests. (catdata?)
        """
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self, csvFilename, csvPathname, timeoutSecs=30)

        # 1x: gunzipped copy of the source
        filename1x = "hastie_1x.data"
        pathname1x = "%s/%s" % (SYNDATASETS_DIR, filename1x)
        h2o_util.file_gunzip(csvPathname, pathname1x)

        # shuffle the 1x copy
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = "%s/%s" % (SYNDATASETS_DIR, filename1xShuf)
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        # 2x: two copies of the shuffled 1x, then shuffle again
        filename2x = "hastie_2x.data"
        pathname2x = "%s/%s" % (SYNDATASETS_DIR, filename2x)
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = "%s/%s" % (SYNDATASETS_DIR, filename2xShuf)
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, pathname2xShuf, timeoutSecs=45)

        # 4x: too big to shuffle? just cat the shuffled 2x twice
        filename4x = "hastie_4x.data"
        pathname4x = "%s/%s" % (SYNDATASETS_DIR, filename4x)
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        glm_doit(self, filename4x, pathname4x, timeoutSecs=120)
# Example #18
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self,csvFilename, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_doit(self,filename2x, pathname2x, timeoutSecs=45)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x,pathname2x,pathname4x)
        
        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self,filename4x, pathname4x, timeoutSecs=60)
# Example #19
    def test_exec_filter_slice2(self):
        timeoutSecs = 10
        csvFilename = "covtype.data"
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        key2 = "c"
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c',
                                     10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['desination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        for trial in range(10):
            print "Doing the execs in order, to feed filters into slices"
            nodeX = 0
            for exprTemplate in exprList:
                execExpr = h2e.fill_in_expr_template(exprTemplate,
                                                     colX=0,
                                                     n=0,
                                                     row=1,
                                                     key2=key2,
                                                     m=2)
                time.sleep(2)
                h2o.check_sandbox_for_errors()

                execResultInspect, min_value = h2e.exec_expr(
                    h2o.nodes[nodeX],
                    execExpr,
                    resultKey="Result.hex",
                    timeoutSecs=4)
                print "min_value:", min_value, "execExpr:", execExpr
                h2o.verboseprint("min: ", min_value, "trial:", trial)
# Example #20
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self,csvFilename, csvPathname, timeoutSecs=75)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_doit(self,filename2x, pathname2x, timeoutSecs=75)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x,pathname2x,pathname4x)
        
        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self,filename4x, pathname4x, timeoutSecs=150)
# Example #21
    def test_rf_params_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        kwargs = {
            'response_variable': 54, 
            'features': 7, 
            'sampling_strategy': 'STRATIFIED_LOCAL', 
            'out_of_bag_error_estimate': 1, 
            'strata_samples': '1=10,2=99,3=99,4=99,5=99,6=99,7=99', 
            'bin_limit': None, 
            'seed': '11111', 
            'model_key': '012345', 
            'ntree': 13, 
            'parallel': 1
        }
        for trial in range(2):

            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
            start = time.time()
            rfv = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
            elapsed = time.time()-start

            cm = rfv['confusion_matrix']
            classification_error = cm['classification_error']
            rows_skipped = cm['rows_skipped']

            # just want to catch the nan case when all rows are skipped
            self.assertLess(rows_skipped, 581012)
            self.assertLess(classification_error, 100) # error if nan
            print "Trial #", trial, "completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
# Example #22
    def test_rf_covtype_train_oobe(self):
        """RF on covtype; out-of-bag classification error must exceed 0.01."""
        # Dead branch kept for reference: the train.csv variant needs header=1.
        if (1 == 0):
            csvFilename = 'train.csv'
            csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename)
            print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=csvFilename + ".hex",
                                         header=1,
                                         timeoutSecs=180)
            # FIX! maybe try specifying column header with column name
            ### kwargs['response_variable'] = A55
        else:
            csvFilename = 'covtype.data'
            print "\nUsing header=0 on the normal covtype.data"
            csvPathname = h2o.find_dataset(
                'UCI/UCI-large/covtype/covtype.data')
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=csvFilename + ".hex",
                                         header=0,
                                         timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        for trial in range(1):
            # params is mutable. This is default.
            # NOTE(review): kwargs aliases the module-level paramDict (no copy);
            # it is only read below, so that appears safe — confirm if this ever
            # runs more than one trial.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # expect some out-of-bag error, i.e. not a degenerate/perfect fit
            classification_error = rfView['confusion_matrix'][
                'classification_error']
            self.assertGreater(
                classification_error, 0.01,
                "train.csv should have out of bag error estimate greater than 0.01"
            )

            print "Trial #", trial, "completed"
# Example #23
    def test_rand_inspect(self):
        """Inspect covtype at random offsets/views; num_rows/num_cols must stay constant."""
        ### h2b.browseTheCloud()
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename)
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(None,
                                     csvPathname,
                                     key=csvFilename,
                                     timeoutSecs=10)
        destination_key = parseKey['destination_key']
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", destination_key

        def inspect_and_check(nodeX,
                              destination_key,
                              offset,
                              view,
                              inspect=None):
            # Run Inspect on the given node; when a baseline `inspect` is
            # passed, assert the invariant fields match it.
            inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX],
                                            destination_key,
                                            offset=offset,
                                            view=view)
            # FIX! get min/max/mean/variance for a col too?
            # fields that must not change with offset/view
            constantNames = [
                'num_cols',
                'num_rows',
            ]
            if inspect is not None:
                for i in constantNames:
                    self.assertEqual(inspect[i], inspectNew[i])

            return inspectNew

        # going to use this to compare against future. num_rows/num_cols should always
        # be the same, regardless of the view. just a coarse sanity check
        origInspect = inspect_and_check(0, destination_key, 0, 1)
        h2o.verboseprint(h2o.dump_json(origInspect))

        num_rows = origInspect['num_rows']
        num_cols = origInspect['num_cols']

        lenNodes = len(h2o.nodes)
        for i in range(1000):
            # we want to use the boundary conditions, so have two level of random choices
            offset = good_choices(num_rows)
            view = good_choices(num_cols)
            # randomize the node used
            nodeX = random.randint(0, lenNodes - 1)
            print "nodeX:", nodeX, "offset:", offset, "view:", view
            inspect_and_check(nodeX, destination_key, offset, view,
                              origInspect)

            # do it again, once in a while
            r = random.randint(0, 10)
            if (r == 0):
                inspect_and_check(nodeX, destination_key, offset, view,
                                  origInspect)
Example #24
0
 def test_poker_xlsx(self):
     """Parse the poker-hand xlsx dataset, then run RF on the result."""
     # maybe can get stuck during polling for parse progress?
     # break it out for pollTimeoutSecs
     xlsxPathname = h2o.find_dataset('poker/poker-hand-testing.xlsx')
     parseKey = h2o_cmd.parseFile(None, xlsxPathname,
                                  timeoutSecs=120, pollTimeoutSecs=60)
     h2o_cmd.runRFOnly(None, parseKey=parseKey, timeoutSecs=120)
 def test_rf_params_rand2_7066883810153380318(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 23, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
Example #26
0
 def test_loop_random_param_covtype(self):
     """Run 10 RF trials on covtype, randomizing params around a default set."""
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         # NOTE(review): paramDict is module-level, not visible in this chunk;
         # presumably pickRandRfParams overlays random picks onto params — confirm
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
Example #27
0
    def test_GLM_covtype(self):
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        if (1==0):
            print "WARNING: just doing the first 33 features, for comparison to allstate numbers"
            # pythonic!
            x = ",".join(map(str,range(33)))
        else:
            x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"

        # L2 
        args = {
            'x': x,
            'y': y,
            'family': 'binomial',
            'link': 'logit',
            'num_cross_validation_folds': 0,
            'case_mode': '=',
            'case': 1,
            'max_iter': max_iter,
            'beta_eps': 1e-3}

        timeoutSecs = 120

        start = time.time()
        kwargs.update({'alpha': 0, 'lambda': 0})
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # L1
        kwargs.update({'alpha': 1, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
Example #28
0
    def test_loop_random_exec_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex',
            maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=5)
        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Example #29
0
 def test_rf_params_rand2(self):
     """Run 10 randomized RF trials on covtype; report elapsed vs. timeout."""
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         # NOTE(review): paramDict comes from module scope (not visible here)
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #30
0
    def test_rf_covtype_train_oobe(self):
        """Parse covtype (or a disabled train.csv variant) and sanity-check RF OOB error."""
        # the (1==0) arm is a deliberately disabled alternative input path,
        # kept around for manual switching
        if (1==0):
            csvFilename = 'train.csv'
            csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename)
            print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, 
                timeoutSecs=180)
            # FIX! maybe try specifying column header with column name
            ### kwargs['response_variable'] = A55
        else:
            csvFilename = 'covtype.data'
            print "\nUsing header=0 on the normal covtype.data"
            csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=0, 
                timeoutSecs=180)


        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.info_from_inspect(inspect, csvPathname)

        for trial in range(1):
            # params is mutable. This is default.
            # NOTE(review): kwargs aliases the module-level paramDict (no copy);
            # harmless with a single trial, but mutations would leak across tests
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # out-of-bag error should be non-trivial (> 1%) on this data
            classification_error = rfView['confusion_matrix']['classification_error']
            self.assertGreater(classification_error, 0.01, 
                "train.csv should have out of bag error estimate greater than 0.01")

            print "Trial #", trial, "completed"
Example #31
0
    def test_A_1mx10_hastie_10_2(self):
        """GLM the hastie dataset at 1x, then again on a 2x self-concatenation."""
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        gzFilename = "1mx10_hastie_10_2.data.gz"
        gzPathname = h2o.find_dataset('logreg' + '/' + gzFilename)
        glm_doit(self, gzFilename, gzPathname, timeoutSecs=300)

        # decompress the 1x copy into the synthetic-datasets dir
        pathname1x = SYNDATASETS_DIR + '/' + "hastie_1x.data"
        h2o_util.file_gunzip(gzPathname, pathname1x)

        # double it by concatenating the 1x file with itself
        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, pathname2x, timeoutSecs=300)
Example #32
0
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Example #33
0
    def test_A_1mx10_hastie_10_2(self):
        """Run glm_doit on the hastie dataset at 1x, then on a 2x replication."""
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self,csvFilename, csvPathname, timeoutSecs=300)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        # double the data by concatenating the 1x file with itself
        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_doit(self,filename2x, pathname2x, timeoutSecs=300)
Example #34
0
    def test_rand_inspect(self):
        """Parse covtype, then stress Inspect with random node/offset/view combos."""
        ### h2b.browseTheCloud()
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/'+ csvFilename)
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(None, csvPathname, key=csvFilename, timeoutSecs=10)
        destination_key = parseKey['destination_key']
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", destination_key 

        def inspect_and_check(nodeX,destination_key,offset,view,inspect=None):
            # Inspect on node nodeX; when a baseline 'inspect' is given, assert
            # the invariant fields still match it
            inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view)
            # FIX! get min/max/mean/variance for a col too?
            constantNames = [
                'num_cols',
                'num_rows',
                ]
            if inspect is not None:
                for i in constantNames:
                    self.assertEqual(inspect[i], inspectNew[i])

            return inspectNew

        # going to use this to compare against future. num_rows/num_cols should always
        # be the same, regardless of the view. just a coarse sanity check
        origInspect = inspect_and_check(0,destination_key,0,1)
        h2o.verboseprint(h2o.dump_json(origInspect))

        num_rows = origInspect['num_rows']
        num_cols = origInspect['num_cols']

        lenNodes = len(h2o.nodes)
        for i in range (1000):
            # we want to use the boundary conditions, so have two level of random choices
            offset = good_choices(num_rows)
            view = good_choices(num_cols)
            # randomize the node used
            nodeX = random.randint(0,lenNodes-1)
            print "nodeX:", nodeX, "offset:", offset, "view:", view
            inspect_and_check(nodeX,destination_key,offset,view,origInspect)

            # do it again, once in a while
            r = random.randint(0,10)
            if (r==0):
                inspect_and_check(nodeX,destination_key,offset,view,origInspect)
Example #35
0
    def test_sum(self):
        """Sum covtype columns at 1x and 2x; 2x results must be exactly double."""
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' +
                                      filename1x)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        # (pathname, key2, timeoutSecs, expected multiple of the 1x result)
        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, key2, timeoutSecs, resultMult) in csvAll:
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=key2,
                                         timeoutSecs=2000)
            print "Parse result['Key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                # first (1x) pass becomes the baseline for later comparisons
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
 def test_loop_random_param_covtype(self):
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     # for determinism, I guess we should spit out the seed?
     ##### SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     SEED = 4201285065147091758
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
Example #37
0
    def test_B_putfile_files(self):
        """put_file covtype, inspect both raw and parsed keys, then run RF."""
        timeoutSecs = 500

        # other candidate datasets, kept commented for manual runs:
        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data",
        #    "billion_rows.csv.gz",
        csvFilenameList = [
            ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1),
        ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, datasetPath, trees) in csvFilenameList:
            csvPathname = h2o.find_dataset(datasetPath)

            # creates csvFilename and csvFilename.hex  keys
            node = h2o.nodes[0]
            key = node.put_file(csvPathname,
                                key=csvFilename,
                                timeoutSecs=timeoutSecs)
            # not using parseFile...used to be a bug if we inspect the file we just put
            # so we test that
            inspect1 = h2o_cmd.runInspect(key=csvFilename)

            parseKey = node.parse(key, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            # We should be able to see the parse result?
            inspect2 = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,
                                           depth=25,
                                           parseKey=parseKey,
                                           timeoutSecs=timeoutSecs)

            # progress tick per dataset
            sys.stdout.write('.')
            sys.stdout.flush()
    def test_rf_params_rand2_7066883810153380318(self):
        """Run 10 RF trials on covtype, seeded with a fixed value for reproducibility."""
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        # SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        SEED = 7066883810153380318
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        for trial in range(10):
            # params is mutable. This is default.
            params = {'ntree': 23, 'parallel': 1, 'features': 7}
            # NOTE(review): paramDict is module-level, not visible in this chunk
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
            h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
            print "Trial #", trial, "completed"
    def test_rf_params_rand2_7066883810153380318(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        # SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        SEED = 7066883810153380318
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            params = {'ntree': 23, 'parallel': 1}
            colX = h2o_rf.pickRandRfParams(paramDict, params)
            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 30 + kwargs['ntree'] * 10 *  (kwargs['parallel'] and 1 or 3)
            h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
            print "Trial #", trial, "completed"
Example #40
0
    def test_loop_random_exec_covtype(self):
        """Parse covtype into c.hex and run randomized exec expressions against it."""
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data',
                                     'c.hex', 15)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        # zeroList/exprList are module-level expression templates (defined elsewhere)
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes),
                                exprList,
                                'c.hex',
                                maxCol=54,
                                maxRow=400000,
                                maxTrials=200,
                                timeoutSecs=15)
        # fail the test if anything bad showed up in the sandbox logs
        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data", 'took', time.time(
        ) - start, 'seconds'
Example #41
0
    def test_loop_random_param_covtype(self):
        start = time.time()
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        print "upload/parse end on ", csvPathname, 'took', time.time() - start, 'seconds'

        kwargs = define_params()
        for trial in range(3):
            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))
            
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Example #42
0
 def test_rf_params_rand2(self):
     """Run 10 randomized RF trials on covtype; print elapsed time vs. timeout."""
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         # NOTE(review): paramDict is module-level, not visible in this chunk
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + (
             (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) *
             (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs,
                       retryDelaySecs=1,
                       csvPathname=csvPathname,
                       **kwargs)
         elapsed = time.time() - start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
             (elapsed * 100) / timeoutSecs)
Example #43
0
 def test_rf_params_rand2(self):
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED = 
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(20):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1}
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         # seems ec2 can be really slow
         timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10)
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         print "Trial #", trial, "completed"
Example #44
0
 def test_rf_params_rand2(self):
     """Run 10 randomized RF trials on covtype with a fresh random seed."""
     # for determinism, I guess we should spit out the seed?
     # random.seed(SEED)
     SEED = random.randint(0, sys.maxint)
     # if you have to force to redo a test
     # SEED = 
     random.seed(SEED)
     print "\nUsing random seed:", SEED
     csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     for trial in range(10):
         # params is mutable. This is default.
         params = {'ntree': 13, 'parallel': 1, 'features': 7}
         # NOTE(review): paramDict is module-level, not visible in this chunk
         colX = h2o_rf.pickRandRfParams(paramDict, params)
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
         start = time.time()
         h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #45
0
    def test_exec_covtype_cols(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 10)
        print "\nParse key is:", parseKey['destination_key']

        ### h2b.browseTheCloud()
        start = time.time()
        # passes with suffix, fails without?
        # suffix = ""
        suffix = ".hex"
        print "Using .hex suffix everywhere until we get type checking in H2O.." + \
              "Fails with first size=1 otherwise"
        for i in range(54):
            execExpr = "Result" + str(i) + suffix + " = c.hex[" + str(i) + "]"
            print "execExpr:", execExpr
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(i) + suffix, 
                timeoutSecs=4)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Example #46
0
    def test_sum(self):
        """Sum covtype columns at 1x and 2x; the 2x sums must be exactly double."""
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' + filename1x)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)

        # (pathname, key2, timeoutSecs, expected multiple of the 1x result)
        csvAll = [
            (pathname1x, "cA", 5,  1),
            (pathname2x, "cB", 5,  2),
            (pathname2x, "cC", 5,  2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, key2, timeoutSecs, resultMult) in csvAll:
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=2000)
            print "Parse result['Key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                # first (1x) pass becomes the baseline for later comparisons
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
Example #47
0
    def test_loop_random_param_covtype(self):
        """Run 20 poisson-family GLM trials on covtype with randomized params."""
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54, 
                'num_cross_validation_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 30
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['num_cross_validation_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            # time the result checking separately from the GLM run itself
            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #48
0
    def test_exec_filter_slice(self):
        timeoutSecs = 10
        csvFilename = "covtype.data"
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        key2 = "c"
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c', 10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['desination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        for trial in range(10):
            print "Doing the execs in order, to feed filters into slices"
            nodeX = 0
            for exprTemplate in exprList:
                execExpr = h2e.fill_in_expr_template(exprTemplate, colX=0, n=0, row=1, key2=key2, m=2)

                execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr, 
                    resultKey="Result.hex", timeoutSecs=4)

                print "min_value:", min_value, "execExpr:", execExpr
                h2o.verboseprint("min: ", min_value, "trial:", trial)
Example #49
0
    def test_putfile_a5m(self):
        timeoutSecs = 500
        csvFilenameList = [
            # use different names for each parse
            # doesn't fail if gzipped?
            ("a5m.csv", 'A', None),
            ("a5m.csv", 'B', None),
            ("a5m.csv", 'C', None),
        ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, key, trees) in csvFilenameList:
            csvPathname = h2o.find_dataset(csvFilename)

            # creates csvFilename and csvFilename.hex  keys
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key=key,
                                         timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,
                                           depth=25,
                                           parseKey=parseKey,
                                           timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush()
Example #50
0
    def test_GLM_gamma_fail1(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        for trial in range(5):
            kwargs = {
                'standardize': 1, 
                'family': 'gamma', 
                'link': 'familyDefault', 
                'y': 54, 
                'lambda': 0.0001,
                'alpha': 0.5, 
                'max_iter': 25, 
                'n_folds': 1, 
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
            # h2o_glm.simpleCheckGLM(self, glm, None, maxExpectedIterations=kwargs['max_iter']-2, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, None, **kwargs)
            print "Trial #", trial, "completed\n"
Example #51
0
    def test_B_putfile_files(self):
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameList = [
            ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1),
            ]
        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for (csvFilename, datasetPath, trees) in csvFilenameList:
            csvPathname = h2o.find_dataset(datasetPath)

            # creates csvFilename and csvFilename.hex  keys
            node = h2o.nodes[0]
            key = node.put_file(csvPathname, key=csvFilename, timeoutSecs=timeoutSecs)
            # not using parseFile...used to be a bug if we inspect the file we just put
            # so we test that
            inspect1 = h2o_cmd.runInspect(key=csvFilename)

            parseKey = node.parse(key, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            # We should be able to see the parse result?
            inspect2 = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # constrain depth to 25
            if trees is not None:
                RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseKey=parseKey,
                    timeoutSecs=timeoutSecs)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #52
0
    def test_exec_covtype_cols(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data',
                                     'c.hex', 10)
        print "\nParse key is:", parseKey['destination_key']

        ### h2b.browseTheCloud()
        start = time.time()
        # passes with suffix, fails without?
        # suffix = ""
        suffix = ".hex"
        print "Using .hex suffix everywhere until we get type checking in H2O.." + \
              "Fails with first size=1 otherwise"
        for i in range(54):
            execExpr = "Result" + str(i) + suffix + " = c.hex[" + str(i) + "]"
            print "execExpr:", execExpr
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey="Result" + str(i) + suffix,
                          timeoutSecs=4)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data", 'took', time.time(
        ) - start, 'seconds'
    def test_rf_covtype_train_full(self):
        csvFilename = 'train.csv'
        csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename)
        print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, 
            timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            rfView = h2o_cmd.runRF(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix']['classification_error']
            self.assertLess(classification_error, 0.02, 
                "train.csv should have full classification error <0.02")

            print "Trial #", trial, "completed"
Example #54
0
 def test_G_RF_covtype(self):
     """Kick off a small RF (6 trees) on covtype with a short timeout."""
     dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     h2o_cmd.runRF(csvPathname=dataset,
                   trees=6,
                   retryDelaySecs=0.5,
                   timeoutSecs=35)
Example #55
0
    def test_loop_random_exec_covtype(self):
        lenNodes = len(h2o.nodes)
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        key2 = 'c.hex'
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', key2,
                                     10)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        # for trial in range(53):
        trial = 0
        while (trial < 100):
            for exprTemplate in exprList:
                trial = trial + 1
                n = trial
                colX = random.randint(1, 54)
                row = random.randint(1, 400000)

                execExpr = exprTemplate
                execExpr = re.sub('<col1>', str(colX), execExpr)
                execExpr = re.sub('<col2>', str(colX + 1), execExpr)
                execExpr = re.sub('<n>', str(n), execExpr)
                execExpr = re.sub('<row>', str(row), execExpr)
                execExpr = re.sub('<keyX>', str(key2), execExpr)

                # pick a random node to execute it on
                randNode = random.randint(0, lenNodes - 1)
                print "\nexecExpr:", execExpr, "on node", randNode

                start = time.time()
                resultExec = h2o_cmd.runExecOnly(node=h2o.nodes[randNode],
                                                 expression=execExpr,
                                                 timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(resultExec))
                # print(h2o.dump_json(resultExec))

                # FIX! race conditions. If json is done, does that mean you can inspect it??
                # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
                if trial > 1:
                    inspectMe = random.choice(inspectList)
                    resultInspect = h2o.nodes[0].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[1].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                    resultInspect = h2o.nodes[2].inspect(inspectMe)
                    h2o.verboseprint(h2o.dump_json(resultInspect))

                # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
                # might be a bug?

                # WARNING! we can't browse the Exec url history, since that will
                # cause the Exec to execute again thru the browser..i.e. it has side effects
                # just look at the last inspect, which should be the resultInspect!
                # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                h2b.browseJsonHistoryAsUrlLastMatch("Exec")
                # url = "http://192.168.0.37:54321/Exec?Expr=Result3+%3D+c.hex%5B26%5D+%2B+Result1&Key=Result"
                # webbrowser.open_new_tab(url)

                # FIX! I suppose we have the problem of stdout/stderr not having flushed?
                # should hook in some way of flushing the remote node stdout/stderr
                h2o.check_sandbox_for_errors()
                print "exec end on ", "covtype.data", 'took', time.time(
                ) - start, 'seconds'
                print "Trial #", trial, "completed\n"