def test_GLM_params_rand2_newargs(self):
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
        paramDict = define_params()

        y = 54
        print "Want to see if there are constant columns"
        goodX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
        print "goodX:", goodX

        # intermittent fail on the forced params?
        for trial in range(10 if DO_FAIL_ONLY else 20):
            if DO_FAIL_ONLY:
                params = define_params_fail()
            else:
                # params is mutable. This is default.
                params = {'y': y, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1}
                h2o_glm.pickRandGlmParams(paramDict, params)

            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2_newargs(self):
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
        paramDict = define_params()

        y = 54
        print "Want to see if there are constant columns"
        goodX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
        print "goodX:", goodX

        # intermittent fail on the forced params?
        for trial in range(10 if DO_FAIL_ONLY else 20):
            if DO_FAIL_ONLY:
                params = define_params_fail()
            else:
                # params is mutable. This is default.
                params = {'y': y, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1}
                h2o_glm.pickRandGlmParams(paramDict, params)

            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
 def test_GLM_params_rand2_newargs(self):
     csvPathname = 'covtype/covtype.20k.data'
     hex_key = 'covtype.20k.hex'
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path=csvPathname,
                                    hex_key=hex_key,
                                    schema='put')
     paramDict = define_params()
     for trial in range(20):
         # params is mutable. This is default.
         params = {
             'y': 54,
             'case': 1,
             'lambda': 0,
             'alpha': 0,
             'n_folds': 1
         }
         colX = h2o_glm.pickRandGlmParams(paramDict, params)
         kwargs = params.copy()
         start = time.time()
         glm = h2o_cmd.runGLM(timeoutSecs=70,
                              parseResult=parseResult,
                              **kwargs)
         # pass the kwargs with all the params, so we know what we asked for!
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         print "glm end on ", csvPathname, 'took', time.time(
         ) - start, 'seconds'
         print "Trial #", trial, "completed\n"
    def test_GLM2_poisson_rand2(self):
        csvPathname = "standard/covtype.data"
        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, schema="put")
        paramDict = define_params()
        for trial in range(20):
            params = {
                "response": 54,
                "n_folds": 3,
                "family": "poisson",
                "alpha": 0.5,
                "lambda": 1e-4,
                "beta_epsilon": 0.001,
                "max_iter": 15,
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs["n_folds"] * 40)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1)))

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_NOPASS_GLM2_tweedie_rand2(self):
        h2o.beta_features = True
        if 1==1:
            csvPathname = 'standard/covtype.data'
            hex_key = 'covtype.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='put')
        else:
            csvPathname = 'covtype/covtype.20k.data'
            hex_key = 'covtype.20k.hex'
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')


        paramDict = define_params()

        for trial in range(10):
            # params is mutable. This is default.
            params = {
                'response': 54, 
                'lambda': 0, 
                'alpha': 0, 
                'n_folds': 1, 
                'family': 'tweedie'
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Beispiel #6
0
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gamma",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 10
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120,
                                     parseKey=parseKey,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Beispiel #7
0
    def test_GLM2_params_rand2(self):
        csvPathname = 'covtype/covtype.20k.data'

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k")

        CLASS = 1
        # make a binomial version 
        execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'response': 54, 
                'alpha': 0.1, 
                # 'lambda': 1e-4, 
                'lambda': 0,
                'n_folds': 1,
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            if 'family' not in kwargs or kwargs['family']=='binomial':
                bHack = {'destination_key': 'B.hex'}
            else:
                bHack = parseResult
            
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'num_cross_validation_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'n_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            
            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2_8977501266014959103(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        # SEED = random.randint(0, sys.maxint)
        SEED = 8977501266014959103
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'alpha': 0,
                'lambda': 0,
                'case': 1,
                'n_folds': 1
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70,
                                     parseKey=parseKey,
                                     **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "Trial #", trial, "completed\n"
Beispiel #11
0
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54, 
                'n_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 15,
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Beispiel #12
0
    def test_GLM_tweedie_rand2(self):
        if 1 == 1:
            csvPathname = "standard/covtype.data"
            hex_key = "covtype.hex"
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, schema="put"
            )
        else:
            csvPathname = "covtype/covtype.20k.data"
            hex_key = "covtype.20k.hex"
            parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=hex_key, schema="put")

        paramDict = define_params()

        for trial in range(10):
            # params is mutable. This is default.
            params = {"y": 54, "case": 4, "case_mode": "=", "lambda": 0, "alpha": 0, "n_folds": 1, "family": "tweedie"}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k")

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Beispiel #14
0
 def test_GLM_params_rand2_newargs(self):
     # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
     key = 'covtype.20k'
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key)
     paramDict = define_params()
     for trial in range(20):
         # params is mutable. This is default.
         params = {
             'y': 54,
             'case': 1,
             'lambda': 0,
             'alpha': 0,
             'n_folds': 1
         }
         colX = h2o_glm.pickRandGlmParams(paramDict, params)
         kwargs = params.copy()
         start = time.time()
         glm = h2o_cmd.runGLMOnly(timeoutSecs=70,
                                  parseKey=parseKey,
                                  **kwargs)
         # pass the kwargs with all the params, so we know what we asked for!
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         print "glm end on ", csvPathname, 'took', time.time(
         ) - start, 'seconds'
         print "Trial #", trial, "completed\n"
Beispiel #15
0
    def test_GLM2_poisson_rand2(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        paramDict = define_params()
        for trial in range(20):
            params = {
                'response': 54, 
                'n_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 15,
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*40)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Beispiel #16
0
    def test_GLM_gaussian_rand2(self):
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put')
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gaussian",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 30
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=120,
                                 parseResult=parseResult,
                                 **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Beispiel #17
0
    def test_GLM_gamma_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gamma",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 24
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120,
                                     parseKey=parseKey,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'

            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': 'binomial',
                'max_iter': 5,
                'case': 1,
                'alpha': 0,
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150,
                              params['n_folds'] * 10 + params['max_iter'] * 10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()

            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
Beispiel #20
0
    def test_GLM_poisson_rand2(self):
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54, 
                'n_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 15,
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*40)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Beispiel #21
0
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 5,
                'n_folds': 1,
                'family': "poisson",
                'alpha': 0.0,
                'lambda': 0,
                'beta_epsilon': 0.001,
                'max_iter': 3,
                'standardize': 1,
                'expert': 1,
                'lsm_solver': 'GenGradient',
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "Trial #", trial, "completed\n"
Beispiel #22
0
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        print "\nParsing", csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        # need more info about the dataset for debug
        h2o_cmd.info_from_inspect(inspect, csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 6, 
                'num_cross_validation_folds': 3, 

                'family': "binomial", 
                'case_mode': ['>'],
                'case': ['20'],

                'alpha': 0, 
                'lambda': 0, 
                'beta_epsilon': 0.001, 
                'max_iter': 8
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 120 + (kwargs['num_cross_validation_folds']*30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time()-start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file("smalldata/poisson/Goalies.csv")
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])

        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                "y": 5,
                "n_folds": 1,
                "family": "poisson",
                "alpha": 0.0,
                "lambda": 0,
                "beta_epsilon": 0.001,
                "max_iter": 3,
                "standardize": 1,
                "expert": 1,
                "lsm_solver": "GenGradient",
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs["n_folds"] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, "took", time.time() - start, "seconds"
            print "Trial #", trial, "completed\n"
    def test_GLM2_binomial_goalies(self):
        h2o.beta_features = True
        csvPathname = 'poisson/Goalies.csv'
        print "\nParsing", csvPathname
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key="A.hex")
        inspect = h2o_cmd.runInspect(None, "A.hex")
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        case = 20
        execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (6 + 1, 6 + 1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        paramDict = define_params()
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'response': 6,
                'n_folds': 1,
                'family': "binomial",
                'alpha': 0,
                # seems we always need a little regularization
                'lambda': 1e-4,
                'beta_epsilon': 0.001,
                'max_iter': 8
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs,
                                 parseResult={'destination_key': 'A.hex'},
                                 **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
 def test_GLM_params_rand2_8977501266014959103(self):
     csvPathname = 'covtype/covtype.20k.data'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     paramDict = define_params()
     for trial in range(20):
         # params is mutable. This is default.
         params = {'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1}
         colX = h2o_glm.pickRandGlmParams(paramDict, params)
         kwargs = params.copy()
         start = time.time()
         glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs)
         # pass the kwargs with all the params, so we know what we asked for!
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
         print "Trial #", trial, "completed\n"
    def test_GLM2_gaussian_rand2(self):
        csvPathname = "standard/covtype.data"
        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, schema="put")
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {"response": 54, "n_folds": 3, "family": "gaussian", "alpha": 0.5, "lambda": 1e-4, "max_iter": 30}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
# Example #28
# 0
    def test_GLM_gamma_rand2(self):
        csvPathname = 'UCI/UCI-large/covtype/covtype.data'
        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
        paramDict = define_params()
        for trial in range(10):
            # params is mutable. This is default.
            params = {'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 24}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=300, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_poisson_goalies_gg(self):
        csvPathname = 'poisson/Goalies.csv'
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put')
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        paramDict = define_params()
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 5,
                'n_folds': 1,
                'family': "poisson",
                'alpha': 0.0,
                'lambda': 0,
                'beta_epsilon': 0.001,
                'max_iter': 3,
                'standardize': 1,
                'expert_settings': 1,
                'lsm_solver': 'GenGradient',
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs,
                                 parseResult=parseResult,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "Trial #", trial, "completed\n"
 def test_GLM_params_rand2_8977501266014959103(self):
     # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
     paramDict = define_params()
     for trial in range(20):
         # params is mutable. This is default.
         params = {'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1}
         colX = h2o_glm.pickRandGlmParams(paramDict, params)
         kwargs = params.copy()
         start = time.time()
         glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
         # pass the kwargs with all the params, so we know what we asked for!
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
         print "Trial #", trial, "completed\n"
    def test_GLM2_binomial_goalies(self):
        h2o.beta_features = True
        csvPathname = 'poisson/Goalies.csv'
        print "\nParsing", csvPathname
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="A.hex")
        inspect = h2o_cmd.runInspect(None, "A.hex")
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        case = 20
        execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % (6+1, 6+1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)


        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'response': 6, 
                'n_folds': 1, 
                'family': "binomial", 
                'alpha': 0,
                # seems we always need a little regularization
                'lambda': 1e-4,
                'beta_epsilon': 0.001, 
                'max_iter': 8
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds']*30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult={'destination_key': 'A.hex'}, **kwargs)
            elapsed = time.time()-start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        # need more info about the dataset for debug
        info_from_inspect(inspect, csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 6, 
                'num_cross_validation_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_eps': 0.001, 
                'max_iter': 30
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['num_cross_validation_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
# Example #33
# 0
    def test_GLM_binomial_goalies(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        print "\nParsing", csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 6,
                'n_folds': 2,
                'family': "binomial",
                'case_mode': '>',
                'case': '20',
                'alpha': 0,
                # seems we always need a little regularization
                'lambda': 1e-4,
                'beta_epsilon': 0.001,
                'max_iter': 8
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_binomial_goalies(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        print "\nParsing", csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 6, 
                'n_folds': 2, 

                'family': "binomial", 
                'case_mode': '>',
                'case': '20',

                'alpha': 0, 
                # seems we always need a little regularization
                'lambda': 1e-4,
                'beta_epsilon': 0.001, 
                'max_iter': 8
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds']*30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time()-start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
# Example #35
# 0
    def test_GLM_binomial_goalies(self):
        csvPathname = "poisson/Goalies.csv"
        print "\nParsing", csvPathname
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put")
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                "y": 6,
                "n_folds": 1,
                "family": "binomial",
                "case_mode": ">",
                "case": "20",
                "alpha": 0,
                # seems we always need a little regularization
                "lambda": 1e-4,
                "beta_epsilon": 0.001,
                "max_iter": 8,
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs["n_folds"] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2_newargs(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file("smalldata/covtype/covtype.20k.data")
        key = "covtype.20k"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key)
        paramDict = define_params()

        for trial in range(50):
            # params is mutable. This is default.
            params = {"y": 54, "case": 1, "lambda": 0, "alpha": 0, "n_folds": 1}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
            print "Trial #", trial, "completed\n"
    def test_GLM_poisson_goalies_gg(self):
        csvPathname = 'poisson/Goalies.csv'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        paramDict = define_params()
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 5,
                'n_folds': 1,
                'family': "poisson",
                'alpha': 0.0,
                'lambda': 0,
                'beta_epsilon': 0.001,
                'max_iter': 3,
                'standardize': 1,
                'expert_settings': 1,
                'lsm_solver': 'GenGradient',
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds']*30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
            elapsed = time.time()-start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_GLM2_tweedie_rand2(self):
        h2o.beta_features = True
        if 1 == 1:
            csvPathname = 'standard/covtype.data'
            hex_key = 'covtype.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           schema='put')
        else:
            csvPathname = 'covtype/covtype.20k.data'
            hex_key = 'covtype.20k.hex'
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           schema='put')

        paramDict = define_params()

        for trial in range(10):
            # params is mutable. This is default.
            params = {
                'response': 54,
                'lambda': 0,
                'alpha': 0,
                'n_folds': 1,
                'family': 'tweedie'
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=180,
                                 parseResult=parseResult,
                                 **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "Trial #", trial, "completed\n"
# Example #39
# 0
    def test_GLM2_lambda_search(self):
        h2o.beta_features = True
        csvPathname = "covtype/covtype.20k.data"

        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put", hex_key="covtype.20k")

        CLASS = 1
        # make a binomial version
        execExpr = "B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ("covtype.20k", 54 + 1, 54 + 1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        paramDict = define_params()
        for trial in range(20):
            params = {}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            # override choices with these
            params = {
                "response": 54,
                "alpha": 0.1,
                "max_iter": 8,
                # 'lambda': 1e-4,
                # 'lambda': 0,
                "lambda": None,
                "lambda_search": 1,
                "n_folds": 1,
            }
            kwargs = params.copy()

            if "family" not in kwargs or kwargs["family"] == "binomial":
                bHack = {"destination_key": "B.hex"}
            else:
                bHack = parseResult

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
            print "Trial #", trial, "completed\n"
# Example #40
# 0
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset("UCI/UCI-large/covtype/covtype.data")
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                "y": 54,
                "n_folds": 3,
                "family": "poisson",
                "alpha": 0.5,
                "lambda": 1e-4,
                "beta_eps": 0.001,
                "max_iter": 30,
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs["n_folds"] * 20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1)))

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_params_rand2(self):
        h2o.beta_features = True
        csvPathname = 'covtype/covtype.20k.data'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k")
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            y = 54
            params = {'response': y, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            case = kwargs.pop('case')
            execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==%s" % ('covtype.20k', y+1, y+1, case)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            start = time.time()

            glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult={'destination_key': 'aHack'}, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_GLM2_gaussian_rand2(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'response': 54, 
                'n_folds': 3, 
                'family': "gaussian", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'max_iter': 30
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
# Example #43
# 0
    def test_GLM_params_rand2(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(hex_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=binomial_key,
                                       columnTypeDict=columnTypeDict,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:, 54], b[:, 54] - 1)
        # make 1 thru 6 go to 1
        Assign(b[:, 54], b[:, 54] != 0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        paramDict = define_params()
        for trial in range(5):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?

            # params is mutable. This is default.
            parameters = {
                'response_column': 'C55',
                'alpha': 0.1,
                # 'lambda': 1e-4,
                'lambda': 0,
            }
            h2o_glm.pickRandGlmParams(paramDict, parameters)

            if 'family' not in parameters or parameters['family'] == 'binomial':
                bHack = binomial_key
            else:
                bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            # fix stupid params
            fixList = [
                'alpha', 'lambda', 'ignored_columns', 'class_sampling_factors'
            ]
            for f in fixList:
                if f in parameters:
                    parameters[f] = "[%s]" % parameters[f]

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=bHack,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self,
                                   model,
                                   parameters,
                                   labelList,
                                   labelListUsed,
                                   allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj(
                    {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1 == 0:
                    print ""
                    for i, c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_GLM_params_rand2(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(hex_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
            check_header=1, timeoutSecs=180, doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=binomial_key, 
            columnTypeDict=columnTypeDict,
            check_header=1, timeoutSecs=180, doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:,54], b[:,54]-1)
        # make 1 thru 6 go to 1
        Assign(b[:,54], b[:,54]!=0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        paramDict = define_params()
        for trial in range(5):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?

            # params is mutable. This is default.
            parameters = {
                'response_column': 'C55',
                'alpha': 0.1,
                # 'lambda': 1e-4, 
                'lambda': 0,
            }
            h2o_glm.pickRandGlmParams(paramDict, parameters)

            if 'family' not in parameters or parameters['family']=='binomial':
                bHack = binomial_key
            else:
                bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)

            # fix stupid params
            fixList = ['alpha', 'lambda', 'ignored_columns', 'class_sampling_factors']
            for f in fixList:
                if f in parameters:
                    parameters[f] = "[%s]" % parameters[f]

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(
                algo='glm',
                model_id=model_key,
                training_frame=bHack,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1==0:
                    print ""
                    for i,c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""
                

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')