def test_sort_of_prostate_with_row_schmoo(self):
    """Grow a synthetic prostate-style csv by a random number of rows each
    trial, re-put/re-parse it under a unique key, and run a small RF on it.

    Exercises repeated put+parse of a growing file (h2o key-caching issues).
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 10000
    write_syn_dataset(csvPathname, totalRows, headerData)
    for trial in range (5):
        # NOTE(review): rowData is never passed to append_syn_dataset below;
        # presumably the helper generates row data itself or reads a global --
        # confirm against the module-level helpers.
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, num)
        totalRows += num
        start = time.time()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        kwargs = {'ntree': 5, 'depth': 5}
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRFOnly(parseResult=parseResult, hex_key=hex_key, timeoutSecs=10, pollTimeoutSecs=5, **kwargs)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'
        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_rf_strata_fail(self):
    """Regression check: RF on covtype with RANDOM sampling must tolerate a
    strata_samples string made entirely of 'undefined' entries."""
    csvPathname = 'UCI/UCI-large/covtype/covtype.data'
    timeoutSecs = 60
    # seven classes, all with bogus 'undefined' strata values
    undefinedStrata = ','.join(['undefined=undefined'] * 7)
    kwargs = dict(
        response_variable=54,
        ntree=50,
        features='',
        depth=2147483647,
        stat_type='ENTROPY',
        ignore='',
        class_weights='1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
        sampling_strategy='RANDOM',
        strata_samples=undefinedStrata,
        sample='67',
        out_of_bag_error_estimate=1,
        model_key='',
        bin_limit=1024,
        seed=784834182943470027,
        parallel=1,
        exclusive_split_limit='',
        iterative_cm=1,
        use_non_local_data=0,
    )
    parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
    h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() parityPl = h2o.find_file('syn_scripts/parity.pl') # two row dataset gets this. Avoiding it for now # java.lang.ArrayIndexOutOfBoundsException: 1 # at hex.rf.Data.sample_fair(Data.java:149) # always match the run below! print "\nAssuming two row dataset is illegal. avoiding" for x in xrange (10,100,10): shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split()) # algorithm for creating the path and filename is hardwired in parity.pl. csvFilename = "parity_128_4_" + str(x) + "_quad.data" trees = 6 timeoutSecs = 20 # always match the gen above! # FIX! we fail if min is 3 for x in xrange (10,100,10): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs) trees += 10 timeoutSecs += 2
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! for x in [10000]: # Have to split the string out to list for pipe shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) # the algorithm for creating the path and filename is hardwired in parity.pl..i.e csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! trial = 1 for x in xrange (1,10,1): sys.stdout.write('.') sys.stdout.flush() # just use one file for now csvFilename = "parity_128_4_" + str(10000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF parseKey = h2o_cmd.parseFile(None, csvPathname) h2o.verboseprint("Trial", trial) h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45, timeoutSecs=120) # don't change tree count yet ## trees += 10 ### timeoutSecs += 2 trial += 1
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range (1,3): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRFOnly(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds' print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in xrange (1,5,1): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # broke out the put separately so we can iterate a test just on the RF key = h2o.nodes[0].put_file(csvPathname) parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex") h2o.verboseprint("Trial", trial) start = time.time() cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds' print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def test_A_randomdata2(self): print "Using datagen1.csv as-is" csvPathname = 'datagen1.csv' # have to give the separator == comma...otherwise H2O can't deduce it on this dataset parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10, header=1, separator=44) h2o_cmd.runRFOnly(parseResult=parseResult, trees=1, response_variable=2, timeoutSecs=20)
def test_badchars(self): print "badchars.csv has some 0x0 (<NUL>) characters." print "They were created by a dd that filled out to buffer boundary with <NUL>" print "They are visible using vim/vi" csvPathname = 'badchars.csv' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, trees=50, timeoutSecs=10)
def test_RFhhp(self): csvPathname = 'hhp.cut3.214.data.gz' print "RF start on ", csvPathname, "this will probably take 1 minute.." start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, trees=200, timeoutSecs=400, retryDelaySecs=15) print "RF end on ", csvPathnamegz, 'took', time.time() - start, 'seconds'
def test_tree_view_wrong_model(self):
    """Build a 1-tree RF on poker1000, then ask RFTreeView for a model key that
    doesn't exist. That used to stack-trace; the H2O error is expected here."""
    path = 'poker/poker1000'
    hexKey = path + ".hex"
    # tree view failed with poker1000, passed with iris
    parsed = h2i.import_parse(bucket='smalldata', path=path, schema='put',
                              hex_key=hexKey, timeoutSecs=10)
    h2o_cmd.runRFOnly(parseResult=parsed, trees=1, model_key="model0", timeoutSecs=10)
    for n in range(1):
        # Give it the wrong model_key name. This caused a stack trace.
        a = h2o_cmd.runRFTreeView(n=n, data_key=hexKey, model_key="wrong_model_name",
                                  timeoutSecs=10, ignoreH2oError=True)
def test_loop_random_param_covtype(self): csvPathname = 'UCI/UCI-large/covtype/covtype.data' for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "Trial #", trial, "completed"
def test_RFhhp(self): csvPathname = 'hhp_107_01.data.gz' print "\nRF start on ", csvPathname, "this will probably take a minute.." start = time.time() kwargs = { 'class_weights': '0=1,1=10', } parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, trees=100, timeoutSecs=120, retryDelaySecs=10, **kwargs) print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_loop_random_param_poker1000(self): csvPathname = 'poker/poker1000' for trial in range(20): # params is mutable. This is default. params = {'ntree': 19, 'parallel': 1} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5) h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "Trial #", trial, "completed"
def test_rf_params_rand2(self): csvPathname = 'UCI/UCI-large/covtype/covtype.data' for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) start = time.time() parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time()-start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_tnc3_ignore(self):
    """Parse tnc3.csv, swap some column values via exec expressions, then
    compare RF with ignore="boat,body" (good case) against RF with no ignore
    (bad case). CM isn't returned in a checkable form, so results are meant
    to be eyeballed in the browser via the RFView pages."""
    csvPathname = 'tnc3.csv'
    print "\n" + csvPathname
    hex_key = "tnc3.hex"
    ### h2b.browseTheCloud()
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        schema='put', timeoutSecs=10, header=1)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)
    # swap numeric values across the first columns before the "good" RF run
    if 1==1:
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList
    if (1==1):
        print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
        print 'The good case with ignore="boat,body"'
        rfv = h2o_cmd.runRFOnly(parseResult=parseResult, trees=5, timeoutSecs=10, ignore="boat,body")
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
    #******************
    # char-swap variant currently disabled (1==0)
    if 1==0:
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList
    if 1==1:
        print "\nNow the bad case (no ignore)"
        rfv = h2o_cmd.runRFOnly(parseResult=parseResult, trees=5, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_file_with_nul_chars_inserted(self): SYNDATASETS_DIR = h2o.make_syn_dir() # we're going to insert <NUL> (0x0) in between every byte! # and then use it. move to a large file. I suppose # we could compare the results to a non-munged file with the same algo # I suppose the <NUL> are thrown away by parse, so doesn't change # chunk boundary stuff. (i.e. not interesting test for RF) csvFilename = 'poker1000' csvPathname = 'poker/' + csvFilename fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=true) nulFilename = "syn_nul.data" nulPathname = SYNDATASETS_DIR + '/' + nulFilename piece_size = 4096 # 4 KiB with open(fullPathname, "rb") as in_file: with open(nulPathname, "wb") as out_file: while True: piece = in_file.read(103) if piece == "": break # end of file # we could just extend piece? # start with a null withNuls = bytearray(piece) # FIX! we'll eventually stick a <NUL> after every byte! withNuls.extend(bytearray.fromhex('00')) out_file.write(withNuls) for trials in xrange(1,2): trees = 6 for x in xrange (161,240,40): y = 10000 * x print "\nTrial:", trials, ", y:", y timeoutSecs = 20 + 5*(len(h2o.nodes)) model_key = csvFilename + "_" + str(trials) parseResult = h2i.import_parse(path=nulPathname, schema='put') h2o_cmd.runRFOnly(parseResult = trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) sys.stdout.write('.') sys.stdout.flush() # partial clean, so we can look at tree builds from this run if hang h2o.clean_sandbox_stdout_stderr()
def test_RF_1000trees(self):
    """Parse a covtype dataset from S3 and run a 100-tree RF (old parseS3File API)."""
    # NAs cause CM to zero..don't run for now
    ### csvPathnamegz = h2o.find_file('smalldata/hhp_9_17_12.predict.100rows.data.gz')
    s3bucket = self.s3_default_bucket()
    # Sequential reassignments: only the LAST value ("covtype.data") is used.
    # The earlier lines are kept as a quick way to switch datasets by reordering.
    s3dataset = "covtype20x.data.gz"
    s3dataset = "covtype.data"
    s3dataset = "covtype200x.data.gz"
    s3dataset = "covtype50x.data"
    s3dataset = "covtype100x.data"
    s3dataset = "covtype.20k.data"
    s3dataset = "covtype.data"
    start = time.time()
    parseKey = h2o_cmd.parseS3File(bucket=s3bucket, filename=s3dataset, timeoutSecs=14800)
    print "Parsing took {0}".format(time.time() - start)
    start = time.time()
    rf_train = h2o_cmd.runRFOnly(
        parseKey=parseKey,
        ntree=100,
        timeoutSecs=14800,
        bin_limit=20000,
        out_of_bag_error_estimate=1,
        gini=0,
        depth=100,
        exclusive_split_limit=0,
    )
    print "Computation took {0} sec".format(time.time() - start)
    print h2o_rf.pp_rf_result(rf_train)
def test_from_import(self): timeoutSecs = 500 csvFilenameAll = [ "covtype.data", "covtype20x.data", ] # pop open a browser on the cloud # h2b.browseTheCloud() for csvFilename in csvFilenameAll: # creates csvFilename.hex from file in importFolder dir hex_key = csvFilename + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='put', hex_key=hex_key, timeoutSecs=500) if not h2o.beta_features: print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) if not h2o.beta_features: RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseResult=parseResult, timeoutSecs=timeoutSecs) ## h2b.browseJsonHistoryAsUrlLastMatch("RFView") ## time.sleep(10) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_B_hdfs_files(self):
    """Parse a curated list of HDFS files and run one RF tree on each
    (old setupImportHdfs/parseImportHdfsFile API)."""
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "TEST-poker1000.csv",
        "leads.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        # these can't RF ..output classes not integer?
        # "bestbuy_test.csv",
        # "bestbuy_train.csv",
        "covtype.data",
        "covtype.4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        # "prostate_2g.csv",
        # "prostate_long.csv.gz",
        "prostate_long_1G.csv",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        # "poker_c1s1_testing_refresh.csv",
        # "3G_poker_shuffle",
        # "billion_rows.csv.gz",
        # "poker-hand.1244M.shuffled311M.full.txt",
    ]
    # pick 8 randomly! (disabled: 1==0 takes the else branch, full list in order)
    if (1==0):
        csvFilenameList = random.sample(csvFilenameAll,8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll
    # pop open a browser on the cloud
    h2b.browseTheCloud()
    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}
    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']
        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
def test_rf_params_rand2(self):
    """Run RF twice on covtype with fixed STRATIFIED_LOCAL sampling params and
    assert the confusion matrix isn't degenerate (not all rows skipped, error
    not nan)."""
    csvPathname = 'UCI/UCI-large/covtype/covtype.data'
    kwargs = {
        'response_variable': 54,
        'features': 7,
        'sampling_strategy': 'STRATIFIED_LOCAL',
        'out_of_bag_error_estimate': 1,
        'strata_samples': '1=10,2=99,3=99,4=99,5=99,6=99,7=99',
        'bin_limit': None,
        'seed': '11111',
        'model_key': '012345',
        'ntree': 13,
        'parallel': 1
    }
    for trial in range(2):
        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3))
        start = time.time()
        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
        # NOTE(review): csvPathname is forwarded to runRFOnly as an extra kwarg;
        # presumably used only for logging inside the helper -- confirm.
        rfv = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=csvPathname, **kwargs)
        elapsed = time.time()-start
        cm = rfv['confusion_matrix']
        classification_error = cm['classification_error']
        rows_skipped = cm['rows_skipped']
        # just want to catch the nan case when all rows are skipped
        self.assertLess(rows_skipped, 581012)
        self.assertLess(classification_error, 100) # error if nan
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_putfile_a5m(self):
    """Put+parse a5m.csv three times and inspect each parse.

    NOTE(review): every tuple carries trees=None, so the RF branch below never
    executes -- only put/parse/inspect is exercised. Presumably intentional;
    set a tree count in the list to re-enable RF.
    """
    timeoutSecs = 500
    csvFilenameList = [
        # use different names for each parse
        # doesn't fail if gzipped?
        ("a5m.csv", 'A', None),
        ("a5m.csv", 'B', None),
        ("a5m.csv", 'C', None),
    ]
    # pop open a browser on the cloud
    h2b.browseTheCloud()
    for (csvFilename, key, trees) in csvFilenameList:
        csvPathname = csvFilename
        # creates csvFilename and csvFilename.hex keys
        parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # constrain depth to 25
        if trees is not None:
            RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult, timeoutSecs=timeoutSecs)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)
        sys.stdout.write('.')
        sys.stdout.flush()
def test_B_hdfs_files(self): print "\nLoad a list of files from HDFS, parse and do 1 RF tree" print "\nYou can try running as hduser/hduser if fail" # larger set in my local dir # fails because classes aren't integers # "allstate_claim_prediction_train_set.zip", csvFilenameList = [ "airlines_88_08_100lines.csv", ] h2b.browseTheCloud() timeoutSecs = 200 # save the first, for all comparisions, to avoid slow drift with each iteration firstglm = {} h2i.setupImportHdfs() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in hdfs dir print "Loading", csvFilename, 'from HDFS' parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000) print csvFilename, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "\n" + csvFilename start = time.time() RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
def test_rf_allyears2k_oobe(self): importFolderPath = '/home/0xdiag/datasets' csvFilename = 'allyears2k.csv' csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) for trial in range(10): kwargs = paramDict timeoutSecs = 30 + kwargs['ntree'] * 2 start = time.time() # randomize the node node = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix']['classification_error'] rows_skipped = rfView['confusion_matrix']['rows_skipped'] mtry = rfView['mtry'] mtry_nodes = rfView['mtry_nodes'] print "mtry:", mtry print "mtry_nodes:", mtry_nodes self.assertEqual(classification_error, 0, "Should have zero oobe error") self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped") print "Trial #", trial, "completed"
def test_B_putfile_files(self): timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "covtype20x.data", # "billion_rows.csv.gz", csvFilenameList = [ ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, csvPathname, trees) in csvFilenameList: parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, timeoutSecs=500, schema='put') print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect2 = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult, timeoutSecs=timeoutSecs) sys.stdout.write('.') sys.stdout.flush()
def test_rf_params_rand2(self): csvPathname = 'space_shuttle_damage.csv' for trial in range(10): # params is mutable. This is default. params = { 'sample': 80, 'stat_type': 'ENTROPY', 'class_weights': 'yes=1000', 'ntree': 50, 'parallel': 1, 'response_variable': 'damage', 'ignore': 'flight', 'ntree': 25, 'out_of_bag_error_estimate': 1, } print "params:", params colX = h2o_rf.pickRandRfParams(paramDict, params) print "params:", params kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10) start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') rfView = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time()-start # just to get the list of per class errors (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True) print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" # why does this vary between 22 and 23 self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes' self.assertLess(classErrorPctList[0],95) # class 0 is 'no' self.assertLess(classErrorPctList[1],29) # class 1 is 'yes' self.assertLess(classification_error,61)
def test_F_no_mc_loop(self): print "\nwith flatfile, with multicast disabled, and RF, 5 trials" allAcceptIptables() multicastDropReceiveIptables() showIptables() for x in range(1,5): h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True) parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put') h2o_cmd.runRFOnly(parseResult=parseResult, trees=50, timeoutSecs=10) h2o.tear_down_cloud() h2o.verboseprint("Waiting", nodes_per_host, "seconds to avoid OS sticky port problem") time.sleep(nodes_per_host) print "Trial", x sys.stdout.write('.') sys.stdout.flush()
def test_rf_sample(self):
    """Write a synthetic dataset, parse it twice under unique keys, and run a
    single-tree RF with sample=75 to eyeball RF's row-sampling bookkeeping
    (rows_skipped vs the rows left out of the training sample)."""
    SEED = random.randint(0, sys.maxint)
    # if you have to force to redo a test
    # SEED = random.seed(SEED)
    print "\nUsing random seed:", SEED
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    print "just going to see if rf is doing the sampling right for one tree on 100000 rows"
    rList = rand_rowData()
    totalRows = 10000
    write_syn_dataset(csvPathname, totalRows, headerData, rList)
    for trial in range (2):
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 30
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=30, header=1)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print "trial #", trial, "totalRows:", totalRows, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'
        cm = rfv['confusion_matrix']
        rows_skipped = cm['rows_skipped']
        # the sample is what we trained on. The CM for one tree is what's left
        # it's not perfectly accurate..allow +-2
        # NEW: after the # of trees is big enough, all the data is used, so we really can't compare
        # any more
        sample = kwargs['sample']
        rowsUsed = sample * totalRows/100
        rowsNotUsed = totalRows - rowsUsed
        ## print "Allowing delta of 0-2"
        ## print "predicted CM rows (rowsNotUsed):", rowsNotUsed, "actually:", totalRows - rows_skipped, "rows_skipped:", rows_skipped
        ## self.assertAlmostEqual(rowsNotUsed, totalRows - rows_skipped, delta=2)
        h2o.check_sandbox_for_errors()
def test_parse_bounds_libsvm(self):
    """Put-parse a set of libsvm files, inspect each, and run a 6-tree RF with
    response_variable=0 (the libsvm label is column 0)."""
    # just do the import folder once
    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        ("mnist_train.svm", "cM", 30, 1),
        # FIX! fails KMeansScore
        # not integer output
        # ("colon-cancer.svm", "cA", 30, 1),
        ("connect4.svm", "cB", 30, 1),
        ("syn_6_1000_10.svm", "cK", 30, 1),
        ("syn_0_100_1000.svm", "cL", 30, 1),
        ("mushrooms.svm", "cG", 30, 1),
        ("duke.svm", "cD", 30, 1),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1),
        ("gisette_scale.svm", "cF", 30, 1),
        ("news20.svm", "cH", 30, 1),
        ("tmc2007_train.svm", "cJ", 30, 1),
        ("covtype.binary.svm", "cC", 30, 1),
        # normal csv
    ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    firstDone = False
    for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        bucket = "home-0xdiag-datasets"
        csvPathname = "libsvm/" + csvFilename
        # PARSE******************************************
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=2000)
        print csvPathname, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)
        # RF******************************************
        kwargs = {
            'ntree': 6,
            'response_variable': 0,
        }
        # NOTE: this shadows the per-dataset timeout from the tuple above
        timeoutSecs = 600
        start = time.time()
        rf = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
def test_B_randomdata2_1_lineend(self):
    """Copy smalldata/datagen1.csv using \\r (CR-only) line endings and check
    that parse + a one-tree RF still work on the rewritten file."""
    # NOTE(review): SYNDATASETS_DIR is not defined in this method; presumably a
    # module-level global set up elsewhere (sibling tests call
    # h2o.make_syn_dir() locally) -- confirm.
    print "Using smalldata/datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending"
    # change lineend, case 1
    csvPathname1 = h2o.find_file('smalldata/datagen1.csv')
    csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'
    infile = open(csvPathname1, 'r')
    outfile = open(csvPathname2,'w') # existing file gets erased
    # assume all the test files are unix lineend.
    # I guess there shouldn't be any "in-between" ones
    # okay if they change I guess.
    for line in infile.readlines():
        outfile.write(line.strip("\n") + "\r")
    infile.close()
    outfile.close()
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, timeoutSecs=10, header=1, separator=44)
    h2o_cmd.runRFOnly(parseKey=parseKey, trees=1, response_variable=2, timeoutSecs=10, csvPathname=csvPathname2)
def test_rf_covtype_fvec(self):
    """Dispatch a no-poll RF on covtype (oobe enabled), wait for the job via
    pollWaitJobs, then fetch the completed model with RFView."""
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
        header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    rfViewInitial = []
    for jobDispatch in range(1):
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(jobDispatch)
        # don't poll for fvec
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True,
            rfView=False, **kwargs)
        elapsed = time.time() - start
        print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print h2o.dump_json(rfResult)
        # FIX! are these already in there?
        # remember what we dispatched so we can RFView it after the job completes
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = kwargs['model_key']
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)
    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
def test_RF_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 10 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_reals_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 
1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_both(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, parsePattern, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 10 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual( classification_error, 0.0004, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_syn_gz_cat(self):
    # RF over synthetic gzip'd datasets with increasing file replication
    # counts; logs one perf summary line per trial to h2o.cloudPerfH2O.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (FILEREPL, rowCount, colCount, key2, timeoutSecs)
    tryList = [
        # summary fails with 100000 cols
        (10, 50, 5000, 'cA', 600),
        (50, 50, 5000, 'cB', 600),
        (100, 50, 5000, 'cC', 600),
        (500, 50, 5000, 'cD', 600),
        (1000, 50, 5000, 'cE', 600),
        (5000, 50, 5000, 'cF', 600),
        (6000, 50, 5000, 'cF', 600),
        (7000, 50, 5000, 'cF', 600),
        (8000, 50, 5000, 'cF', 600),
        (9000, 50, 5000, 'cF', 600),
        (10000, 50, 5000, 'cF', 600),
    ]
    ### h2b.browseTheCloud()
    # base RF parameters; response_variable/features/seed filled in per trial
    paramDict = {
        'class_weight': None,
        'ntree': 10,
        'model_key': 'model_keyA',
        'out_of_bag_error_estimate': 1,
        'stat_type': 'GINI',
        'depth': 2147483647,
        'bin_limit': 10000,
        'parallel': 1,
        'sample': 80,
        'exclusive_split_limit': 0,
    }
    trial = 0
    for (FILEREPL, rowCount, colCount, key2, timeoutSecs) in tryList:
        trial += 1
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
            rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseKey = make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2,
            rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs)
        paramDict['response_variable'] = colCount - 1
        paramDict['features'] = 9
        paramDict['seed'] = random.randint(0, sys.maxint)
        kwargs = paramDict.copy()
        start = time.time()
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", parseKey['python_source_key'], 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        classification_error = rfView['confusion_matrix'][
            'classification_error']
        ### self.assertLess(classification_error, 0.7, "Should have full classification error <0.7")
        algo = "RF "
        # NOTE(review): 'tryHeap' is not defined in this method; presumably a
        # module-level constant -- confirm it exists at module scope.
        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs. trees: {:d} Error: {:6.2f} num_rows: {:d} num_cols: {:d} value_size_bytes: {:d}'.format(
            len(h2o.nodes), tryHeap, algo, parseKey['python_source_key'], elapsed, kwargs['ntree'], \
            classification_error, parseKey['num_rows'], parseKey['num_cols'], parseKey['value_size_bytes'])
        print l
        h2o.cloudPerfH2O.message(l)
        print "Trial #", trial, "completed"
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    # Train RF with OOBE on a slice of csvFilename, then score the model on
    # the held-out last-10% key ('rTest'). Returns the last training rfv.
    # Relies on module-level paramDict and ALLOWED_DELTA (not visible here).
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on", csvFilename
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])
    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]
    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]
    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]
    # don't use the smaller samples..bad error rates, plus for sorted covtype, you can get just one class!
    for trial in range(8,9):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r_" + csvFilename + "_" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # hack so the RF will use the sliced result
        # FIX! don't use the sliced bit..use the whole data for rf training below
        ### parseKey['destination_key'] = resultKey
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
        # kwargs['model_key'] = "model"
        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, "going into RF")
        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        # OOBE "pct right" derived from the training confusion matrix
        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)
        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        kwargs['iterative_cm'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=dataKeyTest)
        h2o_cmd.infoFromInspect(inspect, "dataKeyTest")
        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
            timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"
    # report actual-vs-expected for easy copy/paste back into the expected lists
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
    # return the last rfv done during training
    return rfv
def test_rf_big1_nopoll(self):
    # Dispatch several RF jobs with noPoll, wait for each, then compare each
    # subsequent RFView JSON against the first via JsonDiff to verify that
    # changing ntree (or the model key) actually changes the model.
    # OVERWRITE_RF_MODEL is a module-level flag (not visible in this block).
    csvFilename = 'hhp_107_01.data.gz'
    csvPathname = h2o.find_file("smalldata/" + csvFilename)
    key2 = csvFilename + ".hex"
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=15)
    firstRfView = None
    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete nopoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)
        print "Change the number of trees, while keeping the rf model key name the same"
        print "Checks that we correctly overwrite previous rf model"
        if OVERWRITE_RF_MODEL:
            kwargs['ntree'] = 7 + jobDispatch
        else:
            kwargs['ntree'] = 7
        # don't change the seed if we're overwriting the model. It should get
        # different results just from changing the tree count
        kwargs['seed'] = random.randint(0, sys.maxint)
        # FIX! what model keys do these get?
        # spread the dispatches across random nodes in the cloud
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        h2o_cmd.runRFOnly(node=randomNode, parseKey=parseKey, model_key=model_key,
            timeoutSecs=300, noPoll=True, **kwargs)
        # FIX! are these already in there?
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = model_key
        rfView['ntree'] = kwargs['ntree']
        print "rf job dispatch end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        print "\njobDispatch #", jobDispatch
        # we're going to compare rf results to previous as we go along (so we save rf view results
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        # In this test we're waiting after each one, so we can save the RFView results for comparison to future
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True)
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
        if firstRfView is None:
            # we'll use this to compare the others
            firstRfView = rfViewResult.copy()
            firstModelKey = model_key
            print "firstRfView", h2o.dump_json(firstRfView)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertGreater(len(df.difference), 29,
                msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                (len(df.difference), h2o.dump_json(df.difference)))
def test_B_hdfs_files(self): print "\nLoad a list of files from HDFS, parse and do 1 RF tree" print "\nYou can try running as hduser/hduser if fail" # larger set in my local dir # fails because classes aren't integers # "allstate_claim_prediction_train_set.zip", csvFilenameAll = [ "TEST-poker1000.csv", "leads.csv", "and-testing.data", "arcene2_train.both", "arcene_train.both", # these can't RF ..output classes not integer? # "bestbuy_test.csv", # "bestbuy_train.csv", "covtype.data", "covtype.4x.shuffle.data", "covtype4x.shuffle.data", "covtype.13x.data", "covtype.13x.shuffle.data", # "covtype.169x.data", # "prostate_2g.csv", # "prostate_long.csv.gz", "prostate_long_1G.csv", "hhp.unbalanced.012.1x11.data.gz", "hhp.unbalanced.012.data.gz", "hhp.unbalanced.data.gz", "hhp2.os.noisy.0_1.data", "hhp2.os.noisy.9_4.data", "hhp_9_14_12.data", # "poker_c1s1_testing_refresh.csv", # "3G_poker_shuffle", # "billion_rows.csv.gz", # "poker-hand.1244M.shuffled311M.full.txt", ] # pick 8 randomly! if (1 == 0): csvFilenameList = random.sample(csvFilenameAll, 8) # Alternatively: do the list in order! Note the order is easy to hard else: csvFilenameList = csvFilenameAll # pop open a browser on the cloud h2b.browseTheCloud() timeoutSecs = 200 # save the first, for all comparisions, to avoid slow drift with each iteration firstglm = {} h2i.setupImportHdfs() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in hdfs dir print "Loading", csvFilename, 'from HDFS' parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000) print csvFilename, 'parse time:', parseKey['response']['time'] print "parse result:", parseKey['destination_key'] print "\n" + csvFilename start = time.time() RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_rfview_score(self): csvPathnameTrain = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = { 'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0 } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # new web page for predict? throw it in here for now start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' 
kwargs['iterative_cm'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rf_covtype_train_oobe2(self):
    # Split covtype randomly into train/test keys with randomBitVector+filter,
    # train RF with OOBE on slices of the train key, then score on the test key.
    # Relies on module-level paramDict for the base RF parameters.
    print "\nUse randomBitVector and filter to separate the dataset randomly"
    importFolderPath = "/home/0xdiag/datasets"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    print "\nUsing header=0 on the normal covtype.data"
    # don't import it, just so we don't have all the key names cluttering the view all
    # in the browser
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2,
        header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])
    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    # NOTE(review): sibling variants of this test set rowsForPct[10] = num_rows;
    # here it's the last-10% row count (last10) -- confirm which is intended.
    rowsForPct[10] = last10
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]
    # this was with 10 trees
    # expectTrainPctRightList = [0, 85.27, 88.45, 89.99, 91.11, 91.96, 92.51, 93.03, 93.45, 93.78]
    # expectScorePctRightList = [0, 89.10, 91,90, 93.26, 94.25, 94.74, 95.10, 95.42, 95.72, 95.92]
    # 0 isn't used
    expectTrainPctRightList = [
        0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
    ]
    expectScorePctRightList = [
        0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
    ]
    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # start at 90% rows + 1
    # randomBitVector(size,selected)
    # randomFilter(srcFrame,rows,seed)
    # filter(srcFrame,bitVect)
    # odd. output is byte, all other exec outputs are 8 byte? (at least the ones below?)
    execExpr = "rbv=randomBitVector(" + str(num_rows) + "," + str(
        last10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey="rbv", timeoutSecs=10)
    # complement the bit vector
    execExpr = "not_rbv=colSwap(rbv,0,rbv[0]==0?1:0)"
    h2o_exec.exec_expr(None, execExpr, resultKey="not_rbv", timeoutSecs=10)
    execExpr = dataKeyTest + "=filter(" + key2 + ",rbv)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
    execExpr = dataKeyTrain + "=filter(" + key2 + ",not_rbv)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
    ### time.sleep(3600)
    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]
    for trial in range(1, 10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(
            rowsToUse) + ")"
        # NOTE(review): the next line immediately overwrites the expression
        # above, so every trial slices the whole training key and rowsToUse
        # is effectively unused -- looks like a debugging leftover; confirm.
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the RF at the sliced result instead of the original parse
        parseKey['destination_key'] = resultKey
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        # OOBE "pct right" derived from the training confusion matrix
        oobeTrainPctRight = 100 * (
            1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
            ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)
        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        # no_confusion_matrix=1&
        # clear_confusion_matrix=1
        ### dataKeyTest = data_key
        kwargs['clear_confusion_matrix'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
            timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
        fullScorePctRight = 100 * (
            1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
            ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (
            rowsToUse * 100.0 / num_rows), "pct. of all rows"
    # report actual-vs-expected for easy copy/paste back into the expected lists
    actualDelta = [
        abs(a - b)
        for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
    ]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
    actualDelta = [
        abs(a - b)
        for a, b in zip(expectScorePctRightList, actualScorePctRightList)
    ]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def test_rf_covtype_train_oobe3(self):
    # Train RF on progressively larger random slices (10%..90%) of covtype.data,
    # checking the out-of-bag error on each training slice and the full-scoring
    # error on a held-out 10% sample against hardcoded expected accuracy lists.
    print "\nUse randomFilter to sample the dataset randomly. then slice it"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    # rowsForPct[i] is the row count for i*10 percent of the dataset
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # expected accuracy per trial; index 0 is a placeholder (trials start at 1)
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # FIX! too many digits (10) in the 2nd param seems to cause stack trace
    # randomFilter with the same seed (12345) for test and train; the test key
    # gets pct10 rows, the train key gets 90% of the rows.
    execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
    execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1,10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the parse result at the freshly sliced key so RF trains on it
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        # NOTE(review): paramDict is defined elsewhere in this file and must
        # contain at least 'ntree' — confirm against the module top.
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # classification_error is a fraction; convert to percent-correct
        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        # (runRFView takes model_key/data_key/ntree positionally, so they must
        # not also arrive via **kwargs)
        model_key = rfv['model_key']
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        kwargs['iterative_cm'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
            timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)
        # exercise the prediction endpoint too (result is not checked here)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # summary: print actuals and deltas so expected lists can be re-baselined
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp