Example #1
0
    def test_GLM_poisson_1(self):
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        if (1 == 0):
            print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
            # pythonic!
            x = ",".join(map(str, range(33)))
        else:
            x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'x': x,
            'y': y,
            'family': 'poisson',
            'link': 'log',
            'n_folds': 0,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3
        }

        timeoutSecs = 120
        # L2
        start = time.time()
        kwargs.update({'alpha': 0, 'lambda': 0})
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        print "glm (L2) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # L1
        kwargs.update({'alpha': 1, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
Example #2
0
    def test_tnc3_ignore(self):
        csvFilename = 'tnc3_10.csv'
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
        print "Parse result['Key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseKey
            # in any case, the destination_key in parseKey was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example #3
0
    def test_GLM_covtype(self):
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        if (1==0):
            print "WARNING: just doing the first 33 features, for comparison to allstate numbers"
            # pythonic!
            x = ",".join(map(str,range(33)))
        else:
            x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"

        # L2 
        args = {
            'x': x,
            'y': y,
            'family': 'binomial',
            'link': 'logit',
            'num_cross_validation_folds': 0,
            'case_mode': '=',
            'case': 1,
            'max_iter': max_iter,
            'beta_eps': 1e-3}

        timeoutSecs = 120

        start = time.time()
        kwargs.update({'alpha': 0, 'lambda': 0})
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

        # L1
        kwargs.update({'alpha': 1, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
Example #4
0
    def test_C_prostate(self):
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for appendx in xrange(9):
            if (appendx == 0):
                print "\n0 is member ID. not used"
            elif (appendx == 1):
                print "\n1 is output."
            else:
                if x == "": 
                    x = str(appendx)
                else:
                    x = x + "," + str(appendx)

                sys.stdout.write('.')
                sys.stdout.flush() 
                print "\nx:", x
                print "y:", y

                kwargs = {'x': x, 'y':  y, 'num_cross_validation_folds': 5}
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2, **kwargs)
                # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
                h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
Example #5
0
    def test_B_benign(self):
        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        x = ""
        # cols 0-13. 3 is output
        # no member id in this one
        for appendx in xrange(14):
            if (appendx == 3): 
                print "\n3 is output."
            else:
                if x == "": 
                    x = str(appendx)
                else:
                    x = x + "," + str(appendx)

                csvFilename = "benign.csv"
                csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
                print "\nx:", x
                print "y:", y
                
                kwargs = {'x': x, 'y':  y}
                # fails with num_cross_validation_folds
                print "Not doing num_cross_validation_folds with benign. Fails with 'unable to solve?'"
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
                # no longer look at STR?
                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    def test_GLM_model_key_unique(self):
        modelKeyDict = {}
        for trial in range (1,5):
            csvPathname = 'iris/iris2.csv'
            start = time.time()
            # h2o.py now sets destination_key for a fixed default model name, 
            # we want h2o to create model names for this test, so use none here
            kwargs = {'destination_key': None, 'y':4, 'family': 'binomial', 'case': 1, 'case_mode': '>'}
            
            # make sure each parse is unique dest key (not in use
            hex_key = "iris2_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put',
                timeoutSecs=10, noPoll=True, **kwargs )
            glmResult = h2o_cmd.runGLMOnly(parseResult=parseResutl, timeoutSecs=10, noPoll=True, **kwargs )
            print "GLM #%d" % trial,  "started on ", csvPathname, 'took', time.time() - start, 'seconds'

            model_key = glmResult['destination_key']
            print "GLM model_key:", model_key
            if model_key in modelKeyDict:
                raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" % (trial, modelKeyDict[model_key]))
            modelKeyDict[model_key] = trial

        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
    def test_GLM_params_rand2_8977501266014959103(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        # SEED = random.randint(0, sys.maxint)
        SEED = 8977501266014959103
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'alpha': 0,
                'lambda': 0,
                'case': 1,
                'n_folds': 1
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70,
                                     parseKey=parseKey,
                                     **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #8
0
 def test_GLM_syn_2659x1049x2enum(self):
     csvFilename = "syn_2659x1049x2enum.csv"
     csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
     kwargs = params
     glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=240, **kwargs)
     h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #9
0
    def test_C_prostate(self):
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,9):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            # solver can be ADMM. standardize normalizes the data.
            kwargs = {'x': x, 'y':  y, 'n_folds': 5,\
                'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            h2o.check_sandbox_for_errors()
            sys.stdout.write('.')
            sys.stdout.flush() 
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': 'binomial',
                'max_iter': 5,
                'case': 1,
                'alpha': 0,
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150,
                              params['n_folds'] * 10 + params['max_iter'] * 10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()

            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
Example #11
0
    def test_C_prostate(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,6):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y, 'n_folds': 5}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 

        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
Example #12
0
    def test_B_benign(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()

        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        # cols 0-13. 3 is output
        # no member id in this one
        for maxx in range(11,14):
            x = range(maxx)
            x.remove(3) # 3 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # no longer look at STR?
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 
Example #13
0
    def test_glm_covtype_single_cols(self):
        timeoutSecs = 10
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "\n" + csvPathname

        # columns start at 0
        y = "54"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        print "GLM binomial wth 1 X column at a time"
        print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
        for colX in xrange(54):
            if x == "":
                x = str(colX)
            else:
                # x = x + "," + str(colX)
                x = str(colX)

            sys.stdout.write('.')
            sys.stdout.flush()
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
Example #14
0
    def test_B_importFolder_files(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 1500

        csvFilenameAll = [
            # quick test first
            "covtype.data",
            # then the real thing
            "billion_rows.csv.gz",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=timeoutSecs,
                                                 pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            kwargs = {
                'x': 0,
                'y': 1,
                'n_folds': 0,
                'case_mode': '=',
                'case': 1
            }
            # one coefficient is checked a little more
            colX = 0

            # L2
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

            sys.stdout.write('\n.')
            sys.stdout.flush()
Example #15
0
 def test_GLM_syn_2659x1049x2enum(self):
     csvFilename = "syn_2659x1049x2enum.csv"
     csvPathname = 'logreg' + '/' + csvFilename
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
     kwargs = params
     glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=240, **kwargs)
     h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #16
0
 def test_B_benign(self):
     print "\nStarting benign.csv"
     csvFilename = "benign.csv"
     csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
     # columns start at 0
     y = "3"
     # cols 0-13. 3 is output
     # no member id in this one
     for maxx in range(4,14):
         x = range(maxx)
         x.remove(3) # 3 is output
         x = ",".join(map(str,x))
         print "\nx:", x
         print "y:", y
         
         # solver can be ADMM
         kwargs = {'x': x, 'y':  y,\
              'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
         # fails with n_folds
         print "Not doing n_folds with benign. Fails with 'unable to solve?'"
         glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
         # no longer look at STR?
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         sys.stdout.write('.')
         sys.stdout.flush() 
Example #17
0
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values 1,-1. need to specify case
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in",  (time.time() - start), "secs (python)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
                ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
                ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lamba as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #19
0
    def test_GLM_princeton(self):
        # filename, y, timeoutSecs
        # these are all counts? using gaussian?
        csvFilenameList = [
            ('cuse.dat', 'gaussian', 3, 5), # notUsing
            ('cuse.dat', 'gaussian', 4, 5), # using
            ('copen.dat', 'gaussian', 4, 5),
            ('housing.raw', 'gaussian', 4, 5),
            ]

        trial = 0
        for (csvFilename, family, y, timeoutSecs) in csvFilenameList:
            csvPathname1 = 'logreg/princeton/' + csvFilename
            fullPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_stripped.csv'
            h2o_util.file_strip_trailing_spaces(fullPathname1, csvPathname2)

            kwargs = {'n_folds': 0, 'family': family, 'link': 'familyDefault', 'y': y}
            parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=timeoutSecs, **kwargs)
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end (w/check) on ", csvPathname2, 'took', time.time() - start, 'seconds'
            trial += 1
            print "\nTrial #", trial
Example #20
0
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
Example #21
0
    def test_GLM_big1_nopoll(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        glmInitial = []
        # dispatch multiple jobs back to back
        start = time.time()
        for jobDispatch in range(40):
            kwargs = {'x': x, 'y': y, 'n_folds': 1}
            # FIX! what model keys do these get?
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, noPoll=True, **kwargs)
            glmInitial.append(glm)
            print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            timeoutSecs = 200
        h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=timeoutSecs, retryDelaySecs=10)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM is expected
        for glm in glmInitial:
            print "Checking completed job, with no polling:", glm
            a = h2o.nodes[0].poll_url(glm['response'], noPoll=True)
            h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
Example #22
0
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        for trial in xrange(3):
            sys.stdout.write('.')
            sys.stdout.flush()
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'x': x, 'y': y, 'n_folds': 6}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=300,
                                     **kwargs)

            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\nTrial #", trial
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ["covtype.data"]
        else:
            csvFilenameList = [
                "covtype200x.data",
                "covtype200x.data",
                "covtype.data",
                "covtype.data",
                "covtype20x.data",
                "covtype20x.data",
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = "/home/0xdiag/datasets/standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lamba as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm["GLMModel"]
            coefficients = GLMModel["coefficients"]
            validationsList = GLMModel["validations"]
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write(".")
            sys.stdout.flush()
Example #24
0
    def test_C_prostate(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,6):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y, 'n_folds': 5}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 

        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
Example #25
0
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=200,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
Example #26
0
    def test_B_benign(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()

        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        # cols 0-13. 3 is output
        # no member id in this one
        for maxx in range(11,14):
            x = range(maxx)
            x.remove(3) # 3 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y}
            # fails with n_folds
            print "Not doing n_folds with benign. Fails with 'unable to solve?'"
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # no longer look at STR?
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 
Example #27
0
 def test_GLM_params_rand2_newargs(self):
     # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
     key = 'covtype.20k'
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key)
     paramDict = define_params()
     for trial in range(20):
         # params is mutable. This is default.
         params = {
             'y': 54,
             'case': 1,
             'lambda': 0,
             'alpha': 0,
             'n_folds': 1
         }
         colX = h2o_glm.pickRandGlmParams(paramDict, params)
         kwargs = params.copy()
         start = time.time()
         glm = h2o_cmd.runGLMOnly(timeoutSecs=70,
                                  parseKey=parseKey,
                                  **kwargs)
         # pass the kwargs with all the params, so we know what we asked for!
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         print "glm end on ", csvPathname, 'took', time.time(
         ) - start, 'seconds'
         print "Trial #", trial, "completed\n"
Example #28
0
 def test_B_benign(self):
     print "\nStarting benign.csv"
     csvFilename = "benign.csv"
     csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
     # columns start at 0
     y = "3"
     # cols 0-13. 3 is output
     # no member id in this one
     for maxx in range(4,14):
         x = range(maxx)
         x.remove(3) # 3 is output
         x = ",".join(map(str,x))
         print "\nx:", x
         print "y:", y
         
         # solver can be ADMM
         kwargs = {'x': x, 'y':  y,\
              'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
         # fails with n_folds
         print "Not doing n_folds with benign. Fails with 'unable to solve?'"
         glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
         # no longer look at STR?
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         sys.stdout.write('.')
         sys.stdout.flush() 
Example #29
0
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gamma",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 10
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120,
                                     parseKey=parseKey,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Example #30
0
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'x': x, 'y': y, 'case': -1}

    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                             timeoutSecs=timeoutSecs,
                             **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'n_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            
            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
Example #32
0
    def test_many_cols_with_syn(self):
        ### h2b.browseTheCloud()

        csvFilename = "logreg_trisum_int_cat_10000x10.csv"
        csvPathname = "smalldata/logreg/" + csvFilename
        key2 = csvFilename + ".hex"

        parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        paramDict = define_params()
        paramDict2 = {}
        for k in paramDict:
            # sometimes we have a list to pick from in the value. now it's just list of 1.
            paramDict2[k] = paramDict[k][0]

        y = 10
        # FIX! what should we have for case? 1 should be okay because we have 1's in output col
        kwargs = {'y': y, 'max_iter': 50}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'num_cross_validation_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #34
0
    def test_GLM_gaussian_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'

            print "Trial #", trial, "completed\n"
Example #35
0
    def test_C_prostate(self):
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,9):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            # solver can be ADMM. standardize normalizes the data.
            kwargs = {'x': x, 'y':  y, 'n_folds': 5,\
                'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            h2o.check_sandbox_for_errors()
            sys.stdout.write('.')
            sys.stdout.flush() 
    def test_GLM_params_rand2(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k")

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_glm_covtype_single_cols(self):
        timeoutSecs = 10
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        print "\n" + csvPathname

        # columns start at 0
        y = "54"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        print "GLM binomial wth 1 X column at a time" 
        print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
        for colX in xrange(54):
            if x == "": 
                x = str(colX)
            else:
                # x = x + "," + str(colX)
                x = str(colX)

            sys.stdout.write('.')
            sys.stdout.flush() 
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
Example #38
0
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    y = "10"
    x = ""
    # Took num_cross_validation_folds out, because GLM doesn't include num_cross_validation_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'x': x, 'y':  y, 'case': -1}

    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in",  (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
Example #39
0
    def test_C_prostate_w_predict(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,6):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y, 'n_folds': 5}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']
            print "Doing predict with same dataset, and the GLM model"
            h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])

        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
Example #40
0
    def test_B_benign_w_predict(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()

        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        # cols 0-13. 3 is output
        # no member id in this one
        for maxx in range(11,14):
            x = range(maxx)
            x.remove(3) # 3 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y}
            # fails with n_folds
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']
            print "Doing predict with same dataset, and the GLM model"
            h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
Example #41
0
    def test_GLM_gamma_rand2(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "gamma",
                'alpha': 0.5,
                'lambda': 1e-4,
                'max_iter': 24
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=120,
                                     parseKey=parseKey,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Example #42
0
    def test_B_benign(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()

        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = 'logreg' + '/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        y = "3"
        # cols 0-13. 3 is output
        # no member id in this one
        for maxx in range(11,14):
            x = range(maxx)
            x.remove(3) # 3 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y}
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=15, **kwargs)
            # no longer look at STR?
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 
Example #43
0
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54, 
                'n_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 15,
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000,  10, 'cA.hex', 100),
            (10000,  20, 'cB.hex', 200),
            (10000,  30, 'cC.hex', 300),
            (10000,  40, 'cD.hex', 400),
            (10000,  50, 'cE.hex', 500),
            ]

        ### h2b.browseTheCloud()

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
                ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
            ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount, 
                timeoutSecs=90, incrementingResult=False)
            print "\nexec colResultList", colResultList

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # since we add the output twice, it's no longer colCount-1
            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # only col y-1 (next to last)doesn't get renamed in coefficients 
            # due to enum/categorical expansion
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(3)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
    def test_GLM_with_logit_result_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 5, 'cA', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, \
                "using random coefficients and intercept and logit eqn. for output"
            (coefficients,
             intercept) = gen_rand_equation(colCount, SEEDPERFILE)
            print coefficients, intercept
            write_syn_dataset(csvPathname, rowCount, colCount, coefficients,
                              intercept, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 60,
                'lambda': 1e-4,
                'alpha': 0,
                'weight': 1.0,
                'n_folds': 3,
                'beta_epsilon': 1e-4,
                'thresholds': 0.5,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            (warnings, coefficients,
             intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)
Example #46
0
    def test_many_cols_real(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 100, 'cA', 300),
            (1000, 200, 'cB', 300),
            (1000, 300, 'cC', 300),
            (1000, 400, 'cD', 300),
            (1000, 500, 'cE', 300),
            (1000, 1000, 'cJ', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 50,
                'case': '1',
                'case_mode': '=',
                'lambda': 1e-4,
                'alpha': 0.6
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)

            # try new offset/view
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         offset=100,
                                         view=100)
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lamba as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)

            # different when n_foldsidation is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)

            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #48
0
    def test_GLM_hdfs_YearPredictionMSD(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            h2i.setupImportHdfs()
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lamba as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs)

            # different when n_foldsidation is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #49
0
    def test_categorical_expand_and_probability_output(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
        tryList = [
            (7919, 53, 'cA', 600),
            # translated to enums, 4 per col, so don't go above 2k effective cols or too slow!
            (2659, 400, 'cB', 600),
        ]

        ### h2b.browseTheCloud()

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            print "\nUpload and parse", csvPathname
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=240,
                                         retryDelaySecs=0.5)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]

            y = colCount
            kwargs = {'y': y, 'max_iter': 20}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            # only col Y-1 (next to last)doesn't get renamed in coefficients due to enum/categorical expansion
            print "y:", y
            # FIX! bug was dropped coefficients if constant column is dropped
            ### h2o_glm.simpleCheckGLM(self, glm, Y-2, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #50
0
    def test_short(self):
            csvFilename = 'part-00000b'
            ### csvFilename = 'short'
            importFolderPath = '/home/hduser/data'
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename

            # FIX! does 'separator=' take ints or ?? hex format
            # looks like it takes the hex string (two chars)
            start = time.time()
            # hardwire TAB as a separator, as opposed to white space (9)
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, separator=9)
            print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds"

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500)
            print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # num_rows = inspect['num_rows']
            # num_cols = inspect['num_cols']

            keepPattern = "oly_|mt_|b_"
            y = "is_purchase"
            print "y:", y
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x, 
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 100,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #51
0
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 5,
                'n_folds': 1,
                'family': "poisson",
                'alpha': 0.0,
                'lambda': 0,
                'beta_epsilon': 0.001,
                'max_iter': 3,
                'standardize': 1,
                'expert': 1,
                'lsm_solver': 'GenGradient',
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #52
0
    def test_GLM_many_cols_tridist(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000,  10, 'cA', 300),
            (10000,  20, 'cB', 300),
            (10000,  30, 'cC', 300),
            (10000,  40, 'cD', 300),
            (10000,  50, 'cE', 300),
            (10000,  60, 'cF', 300),
            (10000,  70, 'cG', 300),
            (10000,  80, 'cH', 300),
            (10000,  90, 'cI', 300),
            (10000, 100, 'cJ', 300),
            (10000, 200, 'cK', 300),
            (10000, 300, 'cL', 300),
            (10000, 400, 'cM', 300),
            (10000, 500, 'cN', 300),
            (10000, 600, 'cO', 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "\nParse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]

            y = colCount
            kwargs = {'y': y}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #53
0
    def test_GLM_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (10000,  100, 'cA', 100),
            (10000,  200, 'cB', 200),
            (10000,  300, 'cC', 300),
            ]

        ### h2b.browseTheCloud()

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'y': y, 
                'max_iter': 50, 
                'case': 1,
                'family': 'binomial',
                'lambda': 0,
                'alpha': 0,
                'max_iter': 50,
                'weight': 1.0,
                'thresholds': 0.5,
                'n_folds': 2,
                'beta_epsilon':1.0E-4,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(10)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(10)
Example #54
0
    def test_GLM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [
                (10000, 100, 'cA', 300), 
                (10000, 1000, 'cB', 300), 
                (10000, 3000, 'cC', 500), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (10000, 100, 'cD', 300), 
                (10000, 200, 'cE', 300), 
                (10000, 300, 'cF', 300), 
                (10000, 400, 'cG', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            # normally we dno't create x and rely on the default
            # create the big concat'ed x like the browser, to see what happens
            x = ','.join(map(str, range(colCount)))
            kwargs = {'x': x, 'y': y, 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)