Beispiel #1
0
    def test_B_benign(self):
        """GLM on benign.csv with an incrementally growing predictor list.

        Column 3 is the response; every other column index 0-13 is appended
        to the comma-separated x string one at a time, and GLM is re-run on
        each successive prefix of predictors.
        """
        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        x = ""
        # cols 0-13. 3 is output
        # no member id in this one
        for appendx in xrange(14):
            if (appendx == 3): 
                print "\n3 is output."
            else:
                if x == "": 
                    x = str(appendx)
                else:
                    x = x + "," + str(appendx)

                # NOTE: the GLM below is deliberately inside this else-branch,
                # so it runs once per appended column (not just once at the end).
                csvFilename = "benign.csv"
                csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
                print "\nx:", x
                print "y:", y
                
                kwargs = {'x': x, 'y':  y}
                # fails with num_cross_validation_folds
                print "Not doing num_cross_validation_folds with benign. Fails with 'unable to solve?'"
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
                # no longer look at STR?
                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Beispiel #2
0
    def test_R(self):
        """Run the basic GLM R test script against the live h2o node."""
        script = h2o.find_file('R/tests/test_R_GLM_basic.R')
        lib = h2o.find_file('R/H2O_S4.R')
        node = h2o.nodes[0]
        target = node.http_addr + ":" + str(node.port)
        # spawn_cmd wants an argv list, so split the assembled command line
        cmd = "R -f " + script + " --args " + lib + " " + target
        ps, outpath, errpath = h2o.spawn_cmd('rtest_with_h2o', cmd.split())
        h2o.spawn_wait(ps, outpath, errpath, timeout=10)
Beispiel #3
0
    def test_R_RF_diff_class(self):
        print "\nStarting iris.csv class weight test"
        rScript = h2o.find_file('R/tests/test_R_RF_diff_class.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Compare results from different class weights
        h2o_R.do_R(rScript, rLibrary)
Beispiel #4
0
    def test_R(self):
        """Spawn R on the basic GLM test script, pointing it at node 0."""
        testScript = h2o.find_file('R/tests/test_R_GLM_basic.R')
        loader = h2o.find_file('R/H2O_S4.R')
        where = "%s:%s" % (h2o.nodes[0].http_addr, h2o.nodes[0].port)
        argv = ["R", "-f", testScript, "--args", loader, where]
        ps, outpath, errpath = h2o.spawn_cmd('rtest_with_h2o', argv)
        h2o.spawn_wait(ps, outpath, errpath, timeout=10)
Beispiel #5
0
    def test_rf_model_key_unique(self):
        modelKeyDict = {}
        for trial in range(1, 5):
            if trial == 1:
                csvPathname = h2o.find_file('smalldata/iris/iris.csv')
            else:
                csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
            start = time.time()
            rfResult = h2o_cmd.runRF(trees=6,
                                     timeoutSecs=10,
                                     rfView=False,
                                     csvPathname=csvPathname)
            print "RF #%d" % trial, "started on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            model_key = rfResult['model_key']
            print "model_key:", model_key
            if model_key in modelKeyDict:
                raise Exception(
                    "same model_key used in RF #%d that matches prior RF #%d" %
                    (trial, modelKeyDict[model_key]))
            modelKeyDict[model_key] = trial

        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
Beispiel #6
0
    def test_R_RF_diff_ignore(self):
        print "\nStarting iris.csv ignore predictor(s) test"
        rScript = h2o.find_file('R/tests/test_R_RF_diff_ignore.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Ignore successively more predictor columns
        h2o_R.do_R(rScript, rLibrary)
Beispiel #7
0
    def test_R_C_kmeans_prostate(self):
        print "\nStarting prostate.csv"
        rScript = h2o.find_file('R/tests/test_R_C_kmeans_prostate.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Run k-means with k = 5 on column 2 (Age)
        # Loop to see if we get same centers
        h2o_R.do_R(rScript, rLibrary)
Beispiel #8
0
    def test_R_B_kmeans_benign(self):
        print "\nStarting benign.csv"
        rScript = h2o.find_file('R/tests/test_R_B_kmeans_benign.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Run k-means with k = 3 on all columns
        # Loop to see if we get same centers
        h2o_R.do_R(rScript, rLibrary)
Beispiel #9
0
    def test_R_RF_diff_class(self):
        print "\nStarting iris.csv class weight test"
        rScript = h2o.find_file('R/tests/test_R_RF_diff_class.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Compare results from different class weights
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
        
        (ps, outpath, errpath) =  h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if(rc != 0): raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #10
0
    def test_R_RF_diff_ignore(self):
        print "\nStarting iris.csv ignore predictor(s) test"
        rScript = h2o.find_file('R/tests/test_R_RF_diff_ignore.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Ignore successively more predictor columns
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
        
        (ps, outpath, errpath) =  h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if(rc != 0): raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #11
0
    def test_R_C_kmeans_prostate(self):
        print "\nStarting prostate.csv"
        rScript = h2o.find_file('R/tests/test_R_C_kmeans_prostate.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Run k-means with k = 5 on column 2 (Age)
        # Loop to see if we get same centers
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)

        (ps, outpath, errpath) =  h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if(rc != 0): raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #12
0
    def test_R_C_prostate(self):
        print "\nStarting prostate.csv"
        rScript = h2o.find_file('R/tests/test_R_C_prostate.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Columns start at 0
        # Test columns 1-8, with 1 as response
        # (Skip 0 because member ID)
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)

        (ps, outpath, errpath) =  h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if(rc != 0): raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #13
0
    def test_R_B_benign(self):
        print "\nStarting benign.csv"
        rScript = h2o.find_file('R/tests/test_R_B_benign.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Columns start at 0
        # Test columns 0-13, with 3 as response
        # N-fold cross-validation = 5
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
        
        (ps, outpath, errpath) =  h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if(rc != 0): raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #14
0
    def test_R_C_kmeans_prostate(self):
        print "\nStarting prostate.csv"
        rScript = h2o.find_file('R/tests/test_R_C_kmeans_prostate.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Run k-means with k = 5 on column 2 (Age)
        # Loop to see if we get same centers
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[
            0].http_addr + ":" + str(h2o.nodes[0].port)

        (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o',
                                               shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if (rc != 0):
            raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #15
0
    def test_R_C_prostate(self):
        print "\nStarting prostate.csv"
        rScript = h2o.find_file('R/tests/test_R_C_prostate.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Columns start at 0
        # Test columns 1-8, with 1 as response
        # (Skip 0 because member ID)
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[
            0].http_addr + ":" + str(h2o.nodes[0].port)

        (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o',
                                               shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if (rc != 0):
            raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #16
0
    def test_GenParity1(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange(10, 100, 10):
            shCmdString = "perl " + parityPl + " 128 4 " + str(
                x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange(10, 100, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trees,
                          timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Beispiel #17
0
    def test_B_benign(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()

        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        # cols 0-13. 3 is output
        # no member id in this one
        for maxx in range(11,14):
            x = range(maxx)
            x.remove(3) # 3 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y}
            # fails with n_folds
            print "Not doing n_folds with benign. Fails with 'unable to solve?'"
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # no longer look at STR?
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 
Beispiel #18
0
 def test_GLM_params_rand2_newargs(self):
     # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
     csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
     key = 'covtype.20k'
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key)
     paramDict = define_params()
     for trial in range(20):
         # params is mutable. This is default.
         params = {
             'y': 54,
             'case': 1,
             'lambda': 0,
             'alpha': 0,
             'n_folds': 1
         }
         colX = h2o_glm.pickRandGlmParams(paramDict, params)
         kwargs = params.copy()
         start = time.time()
         glm = h2o_cmd.runGLMOnly(timeoutSecs=70,
                                  parseKey=parseKey,
                                  **kwargs)
         # pass the kwargs with all the params, so we know what we asked for!
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         print "glm end on ", csvPathname, 'took', time.time(
         ) - start, 'seconds'
         print "Trial #", trial, "completed\n"
Beispiel #19
0
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=200,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
Beispiel #20
0
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

# two row dataset gets this. Avoiding it for now
# java.lang.ArrayIndexOutOfBoundsException: 1
# at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange (10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange (10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Beispiel #21
0
 def notest_RF_iris2(self):
     """RF smoke test on iris2.csv (disabled via the 'notest' prefix)."""
     h2o_cmd.runRF(trees=6,
                   timeoutSecs=20,
                   csvPathname=h2o.find_file('smalldata/iris/iris2.csv'))
Beispiel #22
0
    def inspect_columns(self,
                        filename,
                        rows=1,
                        cols=26,
                        columnNames=crange('A', 'Z'),
                        columnTypes=None):
        """Parse a csv file and assert on the shape/columns of the result.

        filename: path fragment resolved via h2o.find_file.
        rows/cols: expected num_rows / num_cols of the parsed key.
        columnNames/columnTypes: optional expected per-column name/type
            sequences; zipped against the parse result, so extra entries
            on either side are ignored. Pass None to skip either check.
        Returns the inspect() dict so callers can do further checks.
        """
        cvsfile = h2o.find_file(filename)
        node = h2o.nodes[0]

        res = h2o_cmd.parseFile(node=node, csvPathname=cvsfile)
        ary = node.inspect(res['destination_key'])

        self.assertEqual(rows, ary['num_rows'])
        self.assertEqual(cols, ary['num_cols'])

        # check column names
        # (PEP 8: use "is not None", not "not ... is None")
        if columnNames is not None:
            for (col, expName) in zip(ary['cols'], columnNames):
                self.assertEqual(expName, col['name'])

        # check column types
        if columnTypes is not None:
            for (col, expType) in zip(ary['cols'], columnTypes):
                self.assertEqual(expType, col['type'])

        return ary
Beispiel #23
0
    def test_many_cols_with_syn(self):
        ### h2b.browseTheCloud()

        csvFilename = "logreg_trisum_int_cat_10000x10.csv"
        csvPathname = "smalldata/logreg/" + csvFilename
        key2 = csvFilename + ".hex"

        parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        paramDict = define_params()
        paramDict2 = {}
        for k in paramDict:
            # sometimes we have a list to pick from in the value. now it's just list of 1.
            paramDict2[k] = paramDict[k][0]

        y = 10
        # FIX! what should we have for case? 1 should be okay because we have 1's in output col
        kwargs = {'y': y, 'max_iter': 50}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
Beispiel #24
0
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            )
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 3):
            sys.stdout.write(".")
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30
            )

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
            print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Beispiel #25
0
    def test_D_GenParity1(self):
        """Generate parity files (11..91 rows), then RF a subset with growing trees."""
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # generation pass: must match the RF loop below
        for x in xrange(11, 100, 10):
            # Have to split the string out to an argv list for the pipe.
            # NOTE(review): no output dir is passed to parity.pl here, unlike
            # sibling tests -- confirm the .data files land in SYNDATASETS_DIR.
            shCmd = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmd.split(), timeout=3)
            # parity.pl hardwires the output filename
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # reduced range to lessen intermittent failures, for now
        for x in xrange(11, 60, 10):
            csvPathname = SYNDATASETS_DIR + '/' + ("parity_128_4_" + str(x) + "_quad.data")
            # FIX! TBD do we always have to kick off the run from node 0?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Beispiel #26
0
 def notest_RF_poker100(self):
     """RF smoke test on poker100 (disabled via the 'notest' prefix)."""
     h2o_cmd.runRF(trees=6,
                   timeoutSecs=20,
                   csvPathname=h2o.find_file('smalldata/poker/poker100'))
Beispiel #27
0
    def test_C_RF_poker100(self):
        """RF on poker100, then generate parity files and RF them with growing trees."""
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # generation pass: must match the RF loop below
        for x in xrange(11, 100, 10):
            cmd = "perl %s 128 4 %d quad %s" % (
                h2o.find_file("syn_scripts/parity.pl"), x, SYNDATASETS_DIR)
            h2o.spawn_cmd_and_wait('parity.pl', cmd.split(), timeout=30)
            # parity.pl hardwires the output filename
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 60
        # reduced range to lessen intermittent failures, for now
        for x in xrange(11, 60, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvPathname = SYNDATASETS_DIR + '/' + ("parity_128_4_" + str(x) + "_quad.data")
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Beispiel #28
0
 def test_A_putfile(self):
     """Upload a file via put_file and verify the stored byte size matches."""
     srcPath = h2o.find_file(file_to_put())
     node = h2o.nodes[0]
     key = node.put_file(srcPath)
     storedSize = node.inspect(key)['value_size_bytes']
     self.assertEqual(h2o.get_file_size(srcPath), storedSize)
Beispiel #29
0
    def test_R_B_benign(self):
        print "\nStarting benign.csv"
        rScript = h2o.find_file('R/tests/test_R_B_benign.R')
        rLibrary = h2o.find_file('R/H2O_Load.R')

        # Columns start at 0
        # Test columns 0-13, with 3 as response
        # N-fold cross-validation = 5
        shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + h2o.nodes[
            0].http_addr + ":" + str(h2o.nodes[0].port)

        (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o',
                                               shCmdString.split())
        rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
        if (rc != 0):
            raise Exception("R exited with non-zero return code %s" % rc)
Beispiel #30
0
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        for trial in xrange(3):
            sys.stdout.write('.')
            sys.stdout.flush()
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'x': x, 'y': y, 'n_folds': 6}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=300,
                                     **kwargs)

            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\nTrial #", trial
Beispiel #31
0
    def test_GenParity1(self):
        """Generate one 10k-row parity file, then RF it nine times (trees=237)."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # generation: must match the RF loop below
        for rows in [10000]:
            # split the command string to an argv list for the pipe
            cmd = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(rows) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', cmd.split(), 4)
            # parity.pl hardwires the output filename
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"

        trial = 1
        for _ in xrange(1, 10, 1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # the put is broken out separately so the RF alone can be iterated
            parseResult = h2i.import_parse(path=csvPathname, schema='put')

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

            # don't change tree count yet
            trial += 1
Beispiel #32
0
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Beispiel #33
0
    def test_GenParity1(self):
        """Generate a 10k-row parity file, then RF it repeatedly (trees=237, depth=45)."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # generation: must match the RF loop below
        for rows in [10000]:
            cmd = "perl %s 128 4 %d quad" % (
                h2o.find_file("syn_scripts/parity.pl"), rows)
            h2o.spawn_cmd_and_wait('parity.pl', cmd.split(), 4)
            # parity.pl hardwires the output filename
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"

        trial = 1
        for _ in xrange(1, 10, 1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvPathname = SYNDATASETS_DIR + '/' + ("parity_128_4_" + str(10000) + "_quad.data")

            # the put is broken out separately so the RF alone can be iterated
            parseKey = h2o_cmd.parseFile(None, csvPathname)

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45,
                              timeoutSecs=120)

            # don't change tree count yet
            trial += 1
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Beispiel #35
0
 def test_A_randomdata2(self):
     print "Using smalldata/datagen1.csv as is"
     csvPathname = h2o.find_file('smalldata/datagen1.csv')
     h2o_cmd.runRF(trees=1,
                   response_variable=2,
                   timeoutSecs=10,
                   csvPathname=csvPathname)
Beispiel #36
0
    def test_rf3_fvec(self):
        """Generate a 10k-row parity file, then one fvec RF (trees=237, max_depth=45)."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # generation: must match the run below
        for rows in [10000]:
            cmd = "perl %s 128 4 %d quad %s" % (
                h2o.find_file("syn_scripts/parity.pl"), rows, SYNDATASETS_DIR)
            h2o.spawn_cmd_and_wait('parity.pl', cmd.split(), 4)
            # parity.pl hardwires the output filename
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"

        trial = 1
        for _ in range(1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # the put is broken out separately so the RF alone can be iterated
            parseResult = h2i.import_parse(path=csvPathname, schema='put',
                                           pollTimeoutSecs=60, timeoutSecs=60)

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, max_depth=45, timeoutSecs=480)

            # don't change tree count yet
            trial += 1
Beispiel #37
0
    def test_rf_1ktrees_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [500]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Beispiel #38
0
 def test_badchars(self):
     print "badchars.csv has some 0x0 (<NUL>) characters."
     print "They were created by a dd that filled out to buffer boundary with <NUL>"
     print "They are visible using vim/vi"
     
     csvPathname = h2o.find_file('smalldata/badchars.csv')
     h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
Beispiel #39
0
 def test_A_putfile(self):
     """Put a local file into the cloud and verify the stored byte size matches the original."""
     srcPath = h2o.find_file(file_to_put())
     node = h2o.nodes[0]
     key = node.put_file(srcPath)
     # inspect reports the size H2O actually stored for the key
     storedSize = node.inspect(key)['value_size_bytes']
     self.assertEqual(h2o.get_file_size(srcPath), storedSize)
Beispiel #40
0
    def test_D_GenParity1(self):
        """Generate parity datasets with parity.pl, then run RF on each with growing tree counts."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange(11, 100, 10):
            # BUG FIX: the output directory was missing from the perl args, so
            # parity.pl never wrote where the run loop below reads from
            # (sibling tests all pass SYNDATASETS_DIR as the final argument).
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl',
                                   shCmdString.split(),
                                   timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            h2o_cmd.runRF(trees=trees,
                          timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
Beispiel #41
0
    def test_D_GenParity1(self):
        """Generate parity datasets with parity.pl, then run RF on each with growing tree counts."""
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange(11, 100, 10):
            # BUG FIX: the output directory was missing from the perl args, so
            # parity.pl never wrote where the run loop below reads from
            # (sibling tests all pass SYNDATASETS_DIR as the final argument).
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            # NOTE(review): timeout=3 is much shorter than the 30 used by sibling
            # tests -- confirm generation reliably finishes in time.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange(11, 60, 10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # what if we do another node?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF(trees=trees,
                          timeoutSecs=timeoutSecs,
                          csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Beispiel #42
0
 def test_prostate_then_prostate_long_parse(self):
     print "\nput and parse of same file, but both key and key2 are the h2o defaults..always different"
     for trial in range(10):
         start = time.time()
         key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate_long.csv.gz"))
         print "trial #", trial, "parse end on ", "prostate_long.csv.gz", "took", time.time() - start, "seconds"
         h2o.check_sandbox_for_errors()
Beispiel #43
0
 def test_E_ParseManyCols(self):
     """Parse a very wide (100x11000) gzipped csv and inspect the parsed result."""
     pathname = h2o.find_file('smalldata/fail1_100x11000.csv.gz')
     parseKey = h2o_cmd.parseFile(None, pathname, timeoutSecs=10)
     # offset=-1 with a small view keeps the inspect response manageable
     h2o_cmd.runInspect(None, parseKey['destination_key'], offset=-1, view=5)
Beispiel #44
0
    def test_import_file(self):
        """Repeatedly remove all keys then re-import the same file, validating the import report."""
        timeoutSecs = 500
        # the same path four times: exercises re-import of an identical file
        cAll = [
            'smalldata/jira/v-3.csv',
            'smalldata/jira/v-3.csv',
            'smalldata/jira/v-3.csv',
            'smalldata/jira/v-3.csv',
        ]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for _ in cAll:
            for i in range(10):
                # race between remove and import?
                csvPathname = h2o.find_file('smalldata/jira/v-3.csv')
                h2o.nodes[0].remove_all_keys()
                importResult = h2o.nodes[0].import_files(csvPathname, timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(importResult))

                files = importResult['files']
                keys = importResult['keys']
                fails = importResult['fails']
                dels = importResult['dels']

                # the import must find the file and create a key, and must
                # report neither failures nor deletions
                if not files:
                    raise Exception("empty files: %s after import" % files)
                if not keys:
                    raise Exception("empty keys: %s after import" % keys)
                if fails:
                    raise Exception("non-empty fails: %s after import" % fails)
                if dels:
                    raise Exception("non-empty dels: %s after import" % dels)
Beispiel #45
0
    def testAll(self):
        """Build a 2-node cloud, then run the JUnit parser tests against it in a spawned JVM."""
        try:
            h2o.build_cloud(node_count=2)

            # we don't have the port or ip configuration here
            # that util/h2o.py does? Keep this in synch with spawn_h2o there.
            # also don't have --nosigar here?
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name=pytest-'+getpass.getuser(),
                    '-Dh2o.arg.ip='+h2o.get_ip_address(),
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    # The tests
                    'water.parser.ParserTest',
                    ])

            rc = ps.wait(None)
            # BUG FIX: file(...).read() leaked the handles; read via context
            # managers so the captured output files are closed.
            with open(stdout) as f:
                out = f.read()
            with open(stderr) as f:
                err = f.read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))

        finally:
            h2o.tear_down_cloud()
Beispiel #46
0
    def test_rf_big1_nopoll(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(1):
            start = time.time()
            kwargs = {}
            # FIX! what model keys do these get?
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model"+str(jobDispatch),\
                timeoutSecs=300, noPoll=True, **kwargs)
            rfViewInitial.append(rfView)
            print "rf job dispatch end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

        h2o_jobs.pollWaitJobs(pattern='GLMModel',
                              timeoutSecs=30,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that
        # way rather than the inspect (to match what simpleCheckGLM is expected
        for rfView in rfViewInitial:
            print "Checking completed job, with no polling:", rfView
            a = h2o.nodes[0].poll_url(rf['response'], noPoll=True)
            h2o_rf.simpleCheckRFView(None, a)
Beispiel #47
0
    def test_B_benign_w_predict(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()

        print "\nStarting benign.csv"
        csvFilename = "benign.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        y = "3"
        # cols 0-13. 3 is output
        # no member id in this one
        for maxx in range(11,14):
            x = range(maxx)
            x.remove(3) # 3 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y}
            # fails with n_folds
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']
            print "Doing predict with same dataset, and the GLM model"
            h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
Beispiel #48
0
    def test_C_RF_poker100(self):
        """RF on poker100, then on generated parity datasets with growing tree counts."""
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # generation loop: ranges must stay in sync with the run loop below
        for rows in xrange(11, 100, 10):
            # spawn needs the command broken into a list for the pipe
            cmd = ("perl " + h2o.find_file("syn_scripts/parity.pl") +
                   " 128 4 " + str(rows) + " quad " + SYNDATASETS_DIR)
            h2o.spawn_cmd_and_wait('parity.pl', cmd.split(), timeout=30)
            # parity.pl hardwires the output filename from its arguments
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"

        trees = 6
        timeoutSecs = 60
        # run only a subset, to lessen intermittent failures for now
        for rows in xrange(11, 60, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Beispiel #49
0
    def test_C_prostate_w_predict(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,6):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y, 'n_folds': 5}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']
            print "Doing predict with same dataset, and the GLM model"
            h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])

        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
    def test_GLM_params_rand2(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k")

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Beispiel #51
0
 def test_B_benign(self):
     print "\nStarting benign.csv"
     csvFilename = "benign.csv"
     csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
     # columns start at 0
     y = "3"
     # cols 0-13. 3 is output
     # no member id in this one
     for maxx in range(4,14):
         x = range(maxx)
         x.remove(3) # 3 is output
         x = ",".join(map(str,x))
         print "\nx:", x
         print "y:", y
         
         # solver can be ADMM
         kwargs = {'x': x, 'y':  y,\
              'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
         # fails with n_folds
         print "Not doing n_folds with benign. Fails with 'unable to solve?'"
         glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
         # no longer look at STR?
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         sys.stdout.write('.')
         sys.stdout.flush() 
Beispiel #52
0
    def test_C_prostate(self):
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,6):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            kwargs = {'x': x, 'y':  y, 'n_folds': 5}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            sys.stdout.write('.')
            sys.stdout.flush() 

        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
Beispiel #53
0
    def test_C_prostate(self):
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,9):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            # solver can be ADMM. standardize normalizes the data.
            kwargs = {'x': x, 'y':  y, 'n_folds': 5,\
                'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            h2o.check_sandbox_for_errors()
            sys.stdout.write('.')
            sys.stdout.flush() 
Beispiel #54
0
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
Beispiel #55
0
    def test_D_GenParity1(self):
        """Generate parity datasets with parity.pl, then parse and RF each with growing tree counts."""
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # generation loop: ranges must stay in sync with the run loop below
        for rows in xrange(11, 100, 10):
            # spawn needs the command broken into a list for the pipe
            cmd = ("perl " + h2o.find_file("syn_scripts/parity.pl") +
                   " 128 4 " + str(rows) + " quad " + SYNDATASETS_DIR)
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', cmd.split(), timeout=3)
            # parity.pl hardwires the output filename from its arguments
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # run only a subset, to lessen intermittent failures for now
        for rows in xrange(11, 60, 10):
            csvFilename = "parity_128_4_" + str(rows) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult,
                          ntrees=trees,
                          timeoutSecs=timeoutSecs)
            trees += 10