def test_B_benign(self):
    print "\nStarting benign.csv"
    csvFilename = "benign.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    # columns start at 0
    y = "3"
    x = ""
    # cols 0-13. 3 is output
    # no member id in this one
    for appendx in xrange(14):
        if appendx == 3:
            print "\n3 is output."
        elif x == "":
            x = str(appendx)
        else:
            x = x + "," + str(appendx)

    csvFilename = "benign.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    print "\nx:", x
    print "y:", y
    kwargs = {'x': x, 'y': y}

    # fails with num_cross_validation_folds
    print "Not doing num_cross_validation_folds with benign. Fails with 'unable to solve?'"
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
    # no longer look at STR?
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_R(self):
    rScript = h2o.find_file('R/tests/test_R_GLM_basic.R')
    rLibrary = h2o.find_file('R/H2O_S4.R')
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    # check the return code, matching the other R tests; otherwise an R failure passes silently
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)
def test_R_RF_diff_class(self):
    print "\nStarting iris.csv class weight test"
    rScript = h2o.find_file('R/tests/test_R_RF_diff_class.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Compare results from different class weights
    h2o_R.do_R(rScript, rLibrary)
def test_rf_model_key_unique(self):
    modelKeyDict = {}
    for trial in range(1, 5):
        if trial == 1:
            csvPathname = h2o.find_file('smalldata/iris/iris.csv')
        else:
            csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
        start = time.time()
        rfResult = h2o_cmd.runRF(trees=6, timeoutSecs=10, rfView=False, csvPathname=csvPathname)
        print "RF #%d" % trial, "started on ", csvPathname, 'took', time.time() - start, 'seconds'
        model_key = rfResult['model_key']
        print "model_key:", model_key
        if model_key in modelKeyDict:
            raise Exception("same model_key used in RF #%d that matches prior RF #%d" %
                (trial, modelKeyDict[model_key]))
        modelKeyDict[model_key] = trial

    # just show the jobs still going, if any. maybe none, because short (iris)
    a = h2o.nodes[0].jobs_admin()
    print "jobs_admin():", h2o.dump_json(a)
def test_R_RF_diff_ignore(self):
    print "\nStarting iris.csv ignore predictor(s) test"
    rScript = h2o.find_file('R/tests/test_R_RF_diff_ignore.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Ignore successively more predictor columns
    h2o_R.do_R(rScript, rLibrary)
def test_R_C_kmeans_prostate(self):
    print "\nStarting prostate.csv"
    rScript = h2o.find_file('R/tests/test_R_C_kmeans_prostate.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Run k-means with k = 5 on column 2 (Age)
    # Loop to see if we get same centers
    h2o_R.do_R(rScript, rLibrary)
def test_R_B_kmeans_benign(self):
    print "\nStarting benign.csv"
    rScript = h2o.find_file('R/tests/test_R_B_kmeans_benign.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Run k-means with k = 3 on all columns
    # Loop to see if we get same centers
    h2o_R.do_R(rScript, rLibrary)
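# The h2o_R.do_R helper used by the tests above presumably wraps the same
# spawn-and-check sequence that the inline variants below spell out by hand.
# A minimal sketch under that assumption (not the actual h2o_R source); it
# reuses only the h2o.spawn_cmd / h2o.spawn_wait calls shown in those tests.
def do_R(rScript, rLibrary, timeout=10):
    # same "R -f <script> --args <library> <ip:port>" command line the
    # inline tests below build by hand
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=timeout)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)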
def test_R_RF_diff_class(self):
    print "\nStarting iris.csv class weight test"
    rScript = h2o.find_file('R/tests/test_R_RF_diff_class.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Compare results from different class weights
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)
def test_R_RF_diff_ignore(self):
    print "\nStarting iris.csv ignore predictor(s) test"
    rScript = h2o.find_file('R/tests/test_R_RF_diff_ignore.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Ignore successively more predictor columns
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)
def test_R_C_kmeans_prostate(self):
    print "\nStarting prostate.csv"
    rScript = h2o.find_file('R/tests/test_R_C_kmeans_prostate.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Run k-means with k = 5 on column 2 (Age)
    # Loop to see if we get same centers
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)
def test_R_C_prostate(self):
    print "\nStarting prostate.csv"
    rScript = h2o.find_file('R/tests/test_R_C_prostate.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Columns start at 0
    # Test columns 1-8, with 1 as response
    # (Skip 0 because member ID)
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)
def test_R_B_benign(self):
    print "\nStarting benign.csv"
    rScript = h2o.find_file('R/tests/test_R_B_benign.R')
    rLibrary = h2o.find_file('R/H2O_Load.R')

    # Columns start at 0
    # Test columns 0-13, with 3 as response
    # N-fold cross-validation = 5
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    rc = h2o.spawn_wait(ps, outpath, errpath, timeout=10)
    if rc != 0:
        raise Exception("R exited with non-zero return code %s" % rc)
def test_GenParity1(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # two row dataset gets this. Avoiding it for now
    # java.lang.ArrayIndexOutOfBoundsException: 1
    # at hex.rf.Data.sample_fair(Data.java:149)

    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"
    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # algorithm for creating the path and filename is hardwired in parity.pl.
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2
def test_B_benign(self):
    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()

    print "\nStarting benign.csv"
    csvFilename = "benign.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    # columns start at 0
    y = "3"
    # cols 0-13. 3 is output
    # no member id in this one
    for maxx in range(11, 14):
        x = range(maxx)
        x.remove(3)  # 3 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y
        kwargs = {'x': x, 'y': y}

        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
        # no longer look at STR?
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM_params_rand2_newargs(self):
    # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
    key = 'covtype.20k'
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key)
    paramDict = define_params()

    for trial in range(20):
        # params is mutable. This is default.
        params = {'y': 54, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1}
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        start = time.time()
        glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def test_C_hhp_107_01(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # build up the parameter string in X
    y = "106"
    x = ""

    # go right to the big X and iterate on that case
    ### for trial in range(2):
    for trial in range(2):
        print "\nTrial #", trial, "start"
        print "\nx:", x
        print "y:", y
        start = time.time()
        kwargs = {'y': y}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
        h2o.check_sandbox_for_errors()
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        print "\nTrial #", trial
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # two row dataset gets this. Avoiding it for now
    # java.lang.ArrayIndexOutOfBoundsException: 1
    # at hex.rf.Data.sample_fair(Data.java:149)

    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"
    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # algorithm for creating the path and filename is hardwired in parity.pl.
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2
def notest_RF_iris2(self):
    trees = 6
    timeoutSecs = 20
    csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
    h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
def inspect_columns(self, filename, rows=1, cols=26, columnNames=crange('A', 'Z'), columnTypes=None):
    csvPathname = h2o.find_file(filename)
    node = h2o.nodes[0]
    res = h2o_cmd.parseFile(node=node, csvPathname=csvPathname)
    ary = node.inspect(res['destination_key'])
    self.assertEqual(rows, ary['num_rows'])
    self.assertEqual(cols, ary['num_cols'])

    # check column names
    if columnNames is not None:
        for (col, expName) in zip(ary['cols'], columnNames):
            self.assertEqual(expName, col['name'])

    # check column types
    if columnTypes is not None:
        for (col, expType) in zip(ary['cols'], columnTypes):
            self.assertEqual(expType, col['type'])
    return ary
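# A hypothetical caller for inspect_columns above, just to show how the
# columnNames/columnTypes hooks are meant to be used. The filename here is
# an illustrative assumption, not a file from the test suite.
def test_inspect_columns_example(self):
    # expects a 1-row, 26-column file with headers A..Z
    self.inspect_columns('smalldata/test/test_26cols.csv',
        rows=1, cols=26,
        columnNames=crange('A', 'Z'),
        columnTypes=None)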
def test_many_cols_with_syn(self):
    ### h2b.browseTheCloud()
    csvFilename = "logreg_trisum_int_cat_10000x10.csv"
    csvPathname = "smalldata/logreg/" + csvFilename
    key2 = csvFilename + ".hex"

    parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10)
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", parseKey['destination_key']

    # We should be able to see the parse result?
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvFilename

    paramDict = define_params()
    paramDict2 = {}
    for k in paramDict:
        # sometimes we have a list to pick from in the value. now it's just list of 1.
        paramDict2[k] = paramDict[k][0]

    y = 10
    # FIX! what should we have for case? 1 should be okay because we have 1's in output col
    kwargs = {'y': y, 'max_iter': 50}
    kwargs.update(paramDict2)

    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs)
    print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

    if not h2o.browse_disable:
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(5)
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + \
            " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 3):
        sys.stdout.write(".")
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30)

        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"
        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # FIX! TBD do we always have to kick off the run from node 0?
        # what if we do another node?
        # FIX! do we need or want a random delay here?
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
        sys.stdout.write('.')
        sys.stdout.flush()
def notest_RF_poker100(self):
    trees = 6
    timeoutSecs = 20
    csvPathname = h2o.find_file('smalldata/poker/poker100')
    h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
def test_C_RF_poker100(self):
    parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 60
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10
def test_A_putfile(self):
    csvfile = h2o.find_file(file_to_put())
    node = h2o.nodes[0]
    key = node.put_file(csvfile)
    resultSize = node.inspect(key)['value_size_bytes']
    origSize = h2o.get_file_size(csvfile)
    self.assertEqual(origSize, resultSize)
def test_C_hhp_107_01(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    y = "106"
    x = ""
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

    for trial in xrange(3):
        sys.stdout.write('.')
        sys.stdout.flush()
        print "\nx:", x
        print "y:", y
        start = time.time()
        kwargs = {'x': x, 'y': y, 'n_folds': 6}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\nTrial #", trial
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    for x in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in xrange(1, 10, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range (1,5): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds' print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! for x in [10000]: # Have to split the string out to list for pipe shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) # the algorithm for creating the path and filename is hardwired in parity.pl..i.e csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! trial = 1 for x in xrange(1, 10, 1): sys.stdout.write('.') sys.stdout.flush() # just use one file for now csvFilename = "parity_128_4_" + str(10000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF parseKey = h2o_cmd.parseFile(None, csvPathname) h2o.verboseprint("Trial", trial) h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45, timeoutSecs=120) # don't change tree count yet ## trees += 10 ### timeoutSecs += 2 trial += 1
def test_1ktrees_job_cancel_many_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

    print "kick off jobs, then cancel them"
    for trial in range(1, 5):
        # random 0 or 1 delay
        delay = random.uniform(0, 1)
        time.sleep(delay)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True,
            timeoutSecs=30, retryDelaySecs=0.25)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'
        ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
        h2o.check_sandbox_for_errors()

    # do one last good one
    rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_A_randomdata2(self):
    print "Using smalldata/datagen1.csv as is"
    csvPathname = h2o.find_file('smalldata/datagen1.csv')
    h2o_cmd.runRF(trees=1, response_variable=2, timeoutSecs=10, csvPathname=csvPathname)
def test_rf3_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    for x in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in range(1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put', pollTimeoutSecs=60, timeoutSecs=60)
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, max_depth=45, timeoutSecs=480)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1
def test_rf_1ktrees_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    # just using one file for now
    for x in [500]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        # must match the 500-row file generated above
        csvFilename = "parity_128_4_" + str(500) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
def test_badchars(self):
    print "badchars.csv has some 0x0 (<NUL>) characters."
    print "They were created by a dd that filled out to buffer boundary with <NUL>"
    print "They are visible using vim/vi"
    csvPathname = h2o.find_file('smalldata/badchars.csv')
    h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
def test_prostate_then_prostate_long_parse(self):
    print "\nput and parse of same file, but both key and key2 are the h2o defaults..always different"
    for trial in range(10):
        start = time.time()
        key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate_long.csv.gz"))
        print "trial #", trial, "parse end on ", "prostate_long.csv.gz", "took", time.time() - start, "seconds"
        h2o.check_sandbox_for_errors()
def test_E_ParseManyCols(self):
    csvPathname = h2o.find_file('smalldata/fail1_100x11000.csv.gz')
    parseKey = h2o_cmd.parseFile(None, csvPathname, timeoutSecs=10)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=-1, view=5)
def test_import_file(self):
    timeoutSecs = 500
    cAll = [
        'smalldata/jira/v-3.csv',
        'smalldata/jira/v-3.csv',
        'smalldata/jira/v-3.csv',
        'smalldata/jira/v-3.csv',
    ]

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for c in cAll:
        for i in range(10):
            # race between remove and import?
            csvPathname = h2o.find_file('smalldata/jira/v-3.csv')
            h2o.nodes[0].remove_all_keys()
            importResult = h2o.nodes[0].import_files(csvPathname, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(importResult))
            files = importResult['files']
            keys = importResult['keys']
            fails = importResult['fails']
            dels = importResult['dels']

            if len(files) == 0:
                raise Exception("empty files: %s after import" % files)
            if len(keys) == 0:
                raise Exception("empty keys: %s after import" % keys)
            if len(fails) != 0:
                raise Exception("non-empty fails: %s after import" % fails)
            if len(dels) != 0:
                raise Exception("non-empty dels: %s after import" % dels)
def testAll(self):
    try:
        h2o.build_cloud(node_count=2)
        # we don't have the port or ip configuration here
        # that util/h2o.py does? Keep this in synch with spawn_h2o there.
        # also don't have --nosigar here?
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
            'java',
            '-Dh2o.arg.ice_root=' + h2o.tmp_dir('ice.'),
            '-Dh2o.arg.name=pytest-' + getpass.getuser(),
            '-Dh2o.arg.ip=' + h2o.get_ip_address(),
            '-ea', '-jar', h2o.find_file('target/h2o.jar'),
            '-mainClass', 'org.junit.runner.JUnitCore',
            # The tests
            'water.parser.ParserTest',
        ])
        rc = ps.wait(None)
        out = file(stdout).read()
        err = file(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        h2o.tear_down_cloud()
def test_rf_big1_nopoll(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)
    rfViewInitial = []

    # dispatch multiple jobs back to back
    for jobDispatch in range(1):
        start = time.time()
        kwargs = {}
        # FIX! what model keys do these get?
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model" + str(jobDispatch),
            timeoutSecs=300, noPoll=True, **kwargs)
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=30, pollTimeoutSecs=120, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected for
    for rfView in rfViewInitial:
        print "Checking completed job, with no polling:", rfView
        # was "rf['response']", but no variable named rf exists; use the saved rfView
        a = h2o.nodes[0].poll_url(rfView['response'], noPoll=True)
        h2o_rf.simpleCheckRFView(None, a)
def test_B_benign_w_predict(self):
    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()

    print "\nStarting benign.csv"
    csvFilename = "benign.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    # columns start at 0
    y = "3"
    # cols 0-13. 3 is output
    # no member id in this one
    for maxx in range(11, 14):
        x = range(maxx)
        x.remove(3)  # 3 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y
        kwargs = {'x': x, 'y': y}

        # fails with n_folds
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']
        print "Doing predict with same dataset, and the GLM model"
        h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
def test_C_prostate_w_predict(self):
    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()

    print "\nStarting prostate.csv"
    # columns start at 0
    y = "1"
    x = ""
    csvFilename = "prostate.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    for maxx in range(2, 6):
        x = range(maxx)
        x.remove(0)  # 0 is member ID. not used
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        kwargs = {'x': x, 'y': y, 'n_folds': 5}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
        # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
        h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']
        print "Doing predict with same dataset, and the GLM model"
        h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])

    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()
def test_GLM_params_rand2(self):
    # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k")

    # for determinism, I guess we should spit out the seed?
    # random.seed(SEED)
    SEED = random.randint(0, sys.maxint)
    # if you have to force to redo a test
    # SEED = random.seed(SEED)
    print "\nUsing random seed:", SEED

    paramDict = define_params()
    for trial in range(20):
        # params is mutable. This is default.
        params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
        colX = h2o_glm.pickRandGlmParams(paramDict, params)
        kwargs = params.copy()
        start = time.time()
        glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
        # pass the kwargs with all the params, so we know what we asked for!
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def test_B_benign(self):
    print "\nStarting benign.csv"
    csvFilename = "benign.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    # columns start at 0
    y = "3"
    # cols 0-13. 3 is output
    # no member id in this one
    for maxx in range(4, 14):
        x = range(maxx)
        x.remove(3)  # 3 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        # solver can be ADMM
        kwargs = {'x': x, 'y': y,
            'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
        # no longer look at STR?
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.check_sandbox_for_errors()
        sys.stdout.write('.')
        sys.stdout.flush()
def test_C_prostate(self):
    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()

    print "\nStarting prostate.csv"
    # columns start at 0
    y = "1"
    x = ""
    csvFilename = "prostate.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    for maxx in range(2, 6):
        x = range(maxx)
        x.remove(0)  # 0 is member ID. not used
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        kwargs = {'x': x, 'y': y, 'n_folds': 5}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
        # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
        h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
        sys.stdout.write('.')
        sys.stdout.flush()

    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()
def test_C_prostate(self):
    print "\nStarting prostate.csv"
    # columns start at 0
    y = "1"
    csvFilename = "prostate.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    for maxx in range(2, 9):
        x = range(maxx)
        x.remove(0)  # 0 is member ID. not used
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y

        # solver can be ADMM. standardize normalizes the data.
        kwargs = {'x': x, 'y': y, 'n_folds': 5,
            'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
        # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
        h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
        h2o.check_sandbox_for_errors()
        sys.stdout.write('.')
        sys.stdout.flush()
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10
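# The parity.pl generation loop above is repeated nearly verbatim across many
# of these tests. A sketch of a shared helper that could factor it out,
# assuming only the h2o.find_file / h2o.spawn_cmd_and_wait calls the tests
# already use; gen_parity_files itself is hypothetical, not an h2o API.
def gen_parity_files(syn_dir, row_counts, timeout=30):
    parityPl = h2o.find_file('syn_scripts/parity.pl')
    csvFilenames = []
    for x in row_counts:
        # "128 4 <rows> quad <dir>" matches the argument order the tests use
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + syn_dir
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=timeout)
        # parity.pl hardwires the output filename from its arguments
        csvFilenames.append("parity_128_4_" + str(x) + "_quad.data")
    return csvFilenames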