def test_C_hhp_107_01(self):
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # build up the parameter string in X
    y = "106"
    x = ""

    # go right to the big X and iterate on that case
    ### for trial in range(2):
    for trial in range(2):
        print "\nTrial #", trial, "start"
        print "\nx:", x
        print "y:", y
        start = time.time()
        kwargs = {'y': y}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
        h2o.check_sandbox_for_errors()
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        print "\nTrial #", trial
def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameList = [
        "airlines_88_08_100lines.csv",
    ]
    h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}
    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
def test_GLM_from_import_hosts(self):
    if localhost:
        csvFilenameList = ["covtype.data"]
    else:
        csvFilenameList = [
            "covtype200x.data",
            "covtype200x.data",
            "covtype.data",
            "covtype.data",
            "covtype20x.data",
            "covtype20x.data",
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    importFolderPath = "/home/0xdiag/datasets/standard"
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
        print csvFilename, "parse time:", parseKey["response"]["time"]
        print "Parse result['destination_key']:", parseKey["destination_key"]

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])

        print "\n" + csvFilename
        start = time.time()
        # can't pass lambda as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm["GLMModel"]
        coefficients = GLMModel["coefficients"]
        validationsList = GLMModel["validations"]
        validations = validationsList.pop()

        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write(".")
        sys.stdout.flush()
def test_parse_1m_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [(10, 65000, "cH", 30)]
    h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        start = time.time()
        print "Summary should work with 65k"
        parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
        )
        print csvFilename, "parse time:", parseResult["response"]["time"]
        print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
        print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect["num_rows"]), \
            " num_cols:", "{:,}".format(inspect["num_cols"])

        # should match # of cols in header or ??
        self.assertEqual(inspect["num_cols"], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount))
        self.assertEqual(inspect["num_rows"], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect["num_rows"], rowCount))

        # we should obey max_column_display
        column_limits = [25, 25000, 50000]
        for column_limit in column_limits:
            inspect = h2o_cmd.runInspect(
                None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
            )
            self.assertEqual(
                len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit)
            )
            for r in range(0, len(inspect["rows"])):
                # NB: +1 below because each row includes a row header row: #{row}
                self.assertEqual(
                    len(inspect["rows"][r]),
                    column_limit + 1,
                    "inspect data rows obeys max_column_display = " + str(column_limit),
                )
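# write_syn_dataset is called above (and in the other synthetic-data tests below)
# but isn't defined in this section. A minimal sketch of what it plausibly does,
# assuming it just writes rowCount rows of colCount random small values, seeded
# per file, optionally translated to enum letters (a hypothetical stand-in, not
# the original helper):
def write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList=None):
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        rowData = [r.randint(0, 1) for j in range(colCount)]
        if translateList:
            # enum variant: map the small ints onto letters
            rowData = [translateList[x] for x in rowData]
        dsf.write(",".join(map(str, rowData)) + "\n")
    dsf.close()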
def test_hosts_with_a_browser(self):
    h2b.browseTheCloud()

    # hang for many hours, so you can play with the browser
    # FIX!, should be able to do something that waits till browser is quit?
    if not h2o.browse_disable:
        time.sleep(500000)
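# The FIX! above asks for something that blocks until the browser exits, instead
# of a fixed sleep. A sketch of one way, assuming we launch the browser ourselves
# with subprocess instead of going through h2b (the browser command is a
# placeholder, and this only works if the launched process owns the window):
import subprocess
def browse_and_wait(url):
    p = subprocess.Popen(["firefox", url])
    p.wait()  # returns when the launched browser process exits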
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(3)
    else:
        h2o_hosts.build_cloud_with_hosts()
    h2b.browseTheCloud()
def test_elapsed_time(self):
    h2b.browseTheCloud()
    print "The reported time should increment for each node, on every node."
    for n in range(NODE_NUM):
        c = h2o.nodes[n].get_cloud()
        self.assertEqual(c['cloud_healthy'], True)

    # the node order doesn't match our node order
    # start with elapsed_time history = 0
    etime = [0 for i in range(NODE_NUM)]

    # loop checking elapsed time increments
    def check_and_update_etime():
        for n in range(NODE_NUM):
            c = h2o.nodes[n].get_cloud()
            for i in range(NODE_NUM):
                t = c['nodes'][i]['elapsed_time']
                name = c['nodes'][i]['name']
                h = c['nodes'][i]['node_healthy']
                print "Current elapsed_time: %s for %s" % (t, name)
                if t < etime[i]:
                    # fail if elapsed_time went backwards since the last poll
                    self.fail("Current elapsed_time: %s at %s is not > its last polled elapsed_time %s" %
                        (t, name, etime[i]))
                etime[i] = t
                self.assertEqual(h, True)

    for j in range(10):
        time.sleep(2)
        check_and_update_etime()
def test_B_putfile_files(self):
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "covtype20x.data",
    # "billion_rows.csv.gz",
    csvFilenameList = [
        ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1),
    ]

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for (csvFilename, csvPathname, trees) in csvFilenameList:
        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, timeoutSecs=500, schema='put')
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect2 = h2o_cmd.runInspect(key=parseResult['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # constrain depth to 25
        if trees is not None:
            RFview = h2o_cmd.runRFOnly(trees=trees, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameList = [
        "airlines_88_08_100lines.csv",
    ]
    h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    for csvFilename in csvFilenameList:
        csvPathname = "datasets/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "parse result:", parseResult['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRF(trees=1, parseResult=parseResult, timeoutSecs=2000)
def test_exec_import_hosts_bigfiles(self):
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/standard"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 4000

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # Update: need unique key names apparently. can't overwrite prior parse output key?
    # replicating lines means they'll get reparsed. good! (but give new key names)
    csvFilenameList = [
        ("covtype.data", "c"),
        ("covtype20x.data", "c20"),
        ("covtype200x.data", "c200"),
        ("billion_rows.csv.gz", "b"),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (csvFilename, key2) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        print "\n" + csvFilename
        exec_list(exprList, lenNodes, csvFilename, key2)
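# exec_list is called above but defined elsewhere in the test file. A rough
# sketch of the shape it likely has, assuming it just drives the shared h2e
# expression helpers over the parsed key (hypothetical, modeled on the h2e
# calls used by the other tests in this section):
def exec_list(exprList, lenNodes, csvFilename, key2):
    h2e.exec_expr_list_rand(lenNodes, exprList, key2,
        maxCol=54, maxRow=400000, maxTrials=20, timeoutSecs=60)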
def test_RF_poker_311M(self):
    # since we'll be waiting, pop a browser
    h2b.browseTheCloud()

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    csvFilename = 'new-poker-hand.full.311M.txt.gz'
    for trials in range(2):
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=5, depth=5, parseKey=parseKey, timeoutSecs=600, retryDelaySecs=10.0)
        print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'
def test_1(self):
    h2b.browseTheCloud()

    csvFilename = "airlines_all.csv"
    csvPathname = 'airlines/airlines_all.csv'
    h2o.beta_features = True
    hex_key = csvFilename + ".hex"
    start = time.time()
    timeoutSecs = 1200

    # airlines_hex = h2i.import_parse(bucket='/home/0xdiag/datasets', path=csvPathname, schema='local', hex_key=hex_key,
    #     timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60, doSummary=False)
    # print "fv.parse done in ", (time.time() - start)

    # kwargs = {
    #     'ignored_cols': 'DepTime,ArrTime,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
    #     'standardize': 1,
    #     'classification': 1,
    #     'response': 'IsDepDelayed',
    #     'family': 'binomial',
    #     'n_folds': 0,
    #     'max_iter': 50,
    #     'beta_epsilon': 1e-4,
    #     'lambda': 1e-5
    # }

    # results = []
    # for i in range(5):
    #     start = time.time()
    #     glm = h2o_cmd.runGLM(parseResult=airlines_hex, timeoutSecs=timeoutSecs, **kwargs)
    #     auc = glm['glm_model']['submodels'][0]['validation']['auc']
    #     results.append('glm2(%d) done in %d, auc=%f' % (i, (time.time() - start), auc))

    # for s in results:
    #     print s

    while 1:
        time.sleep(500000)
        print '.'
def test_putfile_a5m(self):
    timeoutSecs = 500
    csvFilenameList = [
        # use different names for each parse
        # doesn't fail if gzipped?
        ("a5m.csv", 'A', None),
        ("a5m.csv", 'B', None),
        ("a5m.csv", 'C', None),
    ]

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for (csvFilename, key, trees) in csvFilenameList:
        csvPathname = csvFilename
        # creates csvFilename and csvFilename.hex keys
        parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # constrain depth to 25
        if trees is not None:
            RFview = h2o_cmd.runRFOnly(trees=trees, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_parse_200k_cols_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (10, 100000, 'cA', 200, 200),
        (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds"

        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], colCount,
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
        self.assertEqual(inspect['numRows'], rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (inspect['numRows'], rowCount))

        # if not h2o.browse_disable:
        #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        #     time.sleep(5)

        h2i.delete_keys_at_all_nodes()
def test_dead_node_status(self):
    # view logs using each node
    h2b.browseTheCloud()
    for h in h2o.nodes:
        h.log_view()

    # terminate node 1
    h2o.nodes[1].terminate_self_only()
    # remember which is [1] so we can check cloud state correctly
    badPort = "/" + str(h2o.nodes[1].http_addr) + ":" + str(h2o.nodes[1].port)

    nodeList = h2o.nodes[:]  # copy
    del nodeList[1]  # 1 is dead now

    print "We probably need some status to interrogate to understand a node is in red state?"
    print "And I probably need to wait 60 secs to get to red state"
    time.sleep(120)

    # h2o.verify_cloud_size(nodeList, verbose=True, ignoreHealth=True)
    # time.sleep(5)
    # h2o.verify_cloud_size(nodeList, verbose=True, ignoreHealth=True)
    # time.sleep(5)
    # h2o.verify_cloud_size(nodeList, verbose=True, ignoreHealth=True)

    # just check that 'node_healthy' goes 'false' on that node
    # and 'cloud_healthy' goes false
    # everyone should see the same stuff (0 and 2, 1 won't respond)
    for n in (0, 2):
        c = h2o.nodes[n].get_cloud()
        # the node order doesn't match our node order
        for i in range(3):
            expected = c['nodes'][i]['name'] != badPort
            self.assertEqual(c['nodes'][i]['node_healthy'], expected)
        self.assertEqual(c['cloud_healthy'], False,
            msg="node %s shouldn't think the cloud is healthy: %s" % (n, c['cloud_healthy']))
def test_rf_kddcup_1999(self):
    # since we'll be waiting, pop a browser
    h2b.browseTheCloud()

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    csvFilename = 'kddcup_1999.data.gz'

    print "Want to see that I get similar results when using H2O RF defaults (no params to json)" + \
        " compared to running with the parameters specified and matching the browser RF query defaults." + \
        " Also run the param for full scoring vs OOBE scoring."
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300)
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

    for trials in range(4):
        print "\n" + csvFilename, "Trial #", trials
        start = time.time()
        kwargs = {
            'response_variable': 'classifier',
            'ntree': 200,
            'gini': 1,
            'class_weights': None,
            'stratify': 0,
            # 'features': None,
            'features': 7,
            'ignore': None,
            'sample': 67,
            'bin_limit': 1024,
            'depth': 2147483647,
            'seed': 784834182943470027,
            'parallel': 1,
            'exclusive_split_limit': None,
        }

        if trials == 0:
            kwargs = {}
        elif trials == 1:
            kwargs['out_of_bag_error_estimate'] = None
        elif trials == 2:
            kwargs['out_of_bag_error_estimate'] = 0
        elif trials == 3:
            kwargs['out_of_bag_error_estimate'] = 1

        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=50, parseKey=parseKey, timeoutSecs=300, retryDelaySecs=1.0, **kwargs)
        print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "TEST-poker1000.csv",
        "leads.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        # these can't RF ..output classes not integer?
        # "bestbuy_test.csv",
        # "bestbuy_train.csv",
        "covtype.data",
        "covtype.4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        # "prostate_2g.csv",
        # "prostate_long.csv.gz",
        "prostate_long_1G.csv",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        # "poker_c1s1_testing_refresh.csv",
        # "3G_poker_shuffle",
        # "billion_rows.csv.gz",
        # "poker-hand.1244M.shuffled311M.full.txt",
    ]

    # pick 8 randomly!
    if (1==0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}
    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
def test_putfile_a5m(self):
    timeoutSecs = 500
    csvFilenameList = [
        # use different names for each parse
        # doesn't fail if gzipped?
        ("a5m.csv", 'A', None),
        ("a5m.csv", 'B', None),
        ("a5m.csv", 'C', None),
    ]

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for (csvFilename, key, trees) in csvFilenameList:
        csvPathname = csvFilename
        # creates csvFilename and csvFilename.hex keys
        parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500)
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # constrain depth to 25
        if trees is not None:
            RFview = h2o_cmd.runRF(trees=trees, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_from_import_fvec(self):
    print "Sets h2o.beta_features like -bf at command line"
    print "this will redirect import and parse to the 2 variants"
    h2o.beta_features = True  # this will redirect import and parse to the 2 variants

    importFolderPath = '/home/0xdiag/datasets/standard'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        if not h2o.beta_features:
            print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'], timeoutSecs=30)

        if not h2o.beta_features:
            RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)
            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

        # just to make sure we test this
        # FIX! currently the importFolderResult is empty for fvec
        if 1 == 0:
            h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_sum_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    # ("covtype20x.data", "cD", 50, 20),
    # ("covtype200x.data", "cE", 50, 200),
    csvFilenameAll = [
        ("covtype.data", "cA", 5, 1),
        ("covtype.data", "cB", 5, 1),
        ("covtype.data", "cC", 5, 1),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        print "\n" + csvFilename
        h2e.exec_zero_list(zeroList)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2,
            minCol=0, maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(2, java_heap_GB=10, use_flatfile=True)
    else:
        import h2o_hosts
        h2o_hosts.build_cloud_with_hosts()
    h2b.browseTheCloud()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(1)
    else:
        h2o_hosts.build_cloud_with_hosts(1)
    h2b.browseTheCloud()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(2, java_heap_GB=7)
    else:
        h2o_hosts.build_cloud_with_hosts(java_heap_GB=10)
    h2b.browseTheCloud()
def test_GLM2_tnc3_10(self):
    h2o.beta_features = True
    csvFilename = 'tnc3_10.csv'
    print "\n" + csvFilename
    hex_key = "tnc3.hex"
    h2b.browseTheCloud()

    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put', hex_key=hex_key, timeoutSecs=10)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if (1==0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'response': 13, 'n_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseResult
        # in any case, the destination_key in parseResult was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if (1==0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'response': 13, 'n_folds': 6}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_tnc3_ignore(self):
    csvFilename = 'tnc3_10.csv'
    csvPathname = h2o.find_file('smalldata/' + csvFilename)
    print "\n" + csvPathname
    key2 = "tnc3.hex"
    h2b.browseTheCloud()

    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
    print "Parse result['Key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if (1==0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'y': 13, 'num_cross_validation_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseKey
        # in any case, the destination_key in parseKey was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if (1==0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'y': 13, 'num_cross_validation_folds': 6}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_GLM_from_import_hosts(self):
    if localhost:
        csvFilenameList = [
            'YearPredictionMSD.txt'
        ]
    else:
        csvFilenameList = [
            'YearPredictionMSD.txt'
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # can't pass lambda as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)

        # different when n_folds (cross-validation) is used? No trainingErrorDetails?
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm['GLMModel']
        print "GLM time", GLMModel['time']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()

        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM_hdfs_YearPredictionMSD(self):
    if localhost:
        csvFilenameList = [
            'YearPredictionMSD.txt',
            'YearPredictionMSD.txt'
        ]
    else:
        csvFilenameList = [
            'YearPredictionMSD.txt',
            'YearPredictionMSD.txt'
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        h2i.setupImportHdfs()
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # can't pass lambda as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs)

        # different when n_folds (cross-validation) is used? No trainingErrorDetails?
        h2o.verboseprint("\nglm:", glm)
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm['GLMModel']
        print "GLM time", GLMModel['time']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()

        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write('.')
        sys.stdout.flush()
def setUpClass(cls):
    # fails with 3
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(3, java_heap_GB=4, use_flatfile=True)
    else:
        h2o_hosts.build_cloud_with_hosts()
    h2b.browseTheCloud()
def test_B_importFolder_files(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        # quick test first
        "covtype.data",
        # then the real thing
        "billion_rows.csv.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, pollTimeoutSecs=60)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # poker and the water.UDP.set3(UDP.java) fail issue..
        # constrain depth to 25
        # RF seems to get memory allocation errors on single machine (16GB dram)
        ### RFview = h2o_cmd.runRFOnly(trees=1, depth=5, parseKey=parseKey, timeoutSecs=timeoutSecs)
        ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        # now some GLM
        kwargs = {'x': 0, 'y': 1, 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1}
        # one coefficient is checked a little more
        colX = 0

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

        sys.stdout.write('\n.')
        sys.stdout.flush()
def setUpClass(cls):
    global SEED, localhost
    SEED = h2o.setup_random_seed()
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(2, java_heap_GB=4, use_flatfile=True)
    else:
        h2o_hosts.build_cloud_with_hosts()
    h2b.browseTheCloud()
def test_parse_500_cols_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 500, 'cA', 1800, 1800),
    ]

    h2b.browseTheCloud()
    for (rowCount, colCount, orig_hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # create sym links
        multifile = 1000
        # there is already one file. assume it's the "0" case
        for p in range(1, multifile):
            csvPathnameLink = csvPathname + "_" + str(p)
            os.symlink(csvFilename, csvPathnameLink)

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        for trial in range(10):
            hex_key = orig_hex_key + str(trial)
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname + "*", schema='local', hex_key=hex_key,
                delete_on_done=1, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                " numRows:", "{:,}".format(inspect['numRows']), \
                " numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount * multifile,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount * multifile))
def setUpClass(cls):
    # fails with 3
    global localhost
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(1)
    else:
        h2o_hosts.build_cloud_with_hosts(1)
    h2b.browseTheCloud()
def tear_down_cloud(nodeList=None, sandboxIgnoreErrors=False, force=False):
    if h2o_args.sleep_at_tear_down:
        print "Opening browser to cloud, and sleeping for 3600 secs, before cloud teardown (for debug)"
        import h2o_browse as h2b
        h2b.browseTheCloud()
        time.sleep(3600)

    if not nodeList:
        nodeList = h2o_nodes.nodes

    # this could fail too. Should this be set by -uc/--usecloud? or command line argument
    if nodeList and nodeList[0].delete_keys_at_teardown:
        start = time.time()
        h2i.delete_keys_at_all_nodes(timeoutSecs=300)
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"

    # could the nodeList still be empty in some exception cases? Assume not for now

    # FIX! don't send shutdown if we're using an existing cloud
    # also, copy the "delete keys at teardown" from testdir_release
    # Assume there's a last "test" that's run to shutdown the cloud
    # don't tear down with -ccj either
    # FIX! what about usecloud or clone_cloud_json params from build_cloud time?
    if force or not (h2o_args.usecloud or h2o_args.clone_cloud_json):
        try:
            # update: send a shutdown to all nodes.
            # h2o maybe doesn't propagate well if sent to one node
            # the api watchdog shouldn't complain about this?
            # just send one?
            # for n in nodeList:
            #     n.shutdown_all()
            h2o_nodes.nodes[0].shutdown_all()
        except:
            pass

        # ah subtle. we might get excepts in issuing the shutdown, don't abort out
        # of trying the process kills if we get any shutdown exception (remember we go to all nodes)
        # so we might? nodes are shutting down?
        # FIX! should we wait a bit for a clean shutdown, before we process kill?
        # It can take more than 1 sec though.
        try:
            time.sleep(2)
            for n in nodeList:
                n.terminate()
                verboseprint("tear_down_cloud n:", n)
        except:
            pass

    check_sandbox_for_errors(sandboxIgnoreErrors=sandboxIgnoreErrors, python_test_name=h2o_args.python_test_name)
    # get rid of all those pesky line marker files. Unneeded now
    clean_sandbox_doneToLine()
    nodeList[:] = []
    h2o_nodes.nodes = []
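# Typical call site for tear_down_cloud, assuming it's exposed through the h2o
# module as the tests in this section suggest, and a unittest harness in their
# style (a sketch; the base-class name is hypothetical):
import unittest
class CloudTestBase(unittest.TestCase):
    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud(sandboxIgnoreErrors=True)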
def test_sum_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    if localhost:
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype.data", "cB", 5, 1),
            ("covtype.data", "cC", 5, 1),
        ]
    else:
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype20x.data", "cD", 50, 20),
            ("covtype200x.data", "cE", 50, 200),
        ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        print "\n" + csvFilename
        h2e.exec_zero_list(zeroList)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs)
        print "\n*************"
        print "colResultList", colResultList
        print "*************"

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_from_import(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "covtype20x.data",
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

        print "\n" + csvFilename
        start = time.time()
        # poker and the water.UDP.set3(UDP.java) fail issue..
        # constrain depth to 25
        RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # wait in case it recomputes it
        time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_cols_enum_multi_import(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    translateList = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
        'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
    ]
    tryList = [
        (300, 100, 'cA', 60, '*x[2-5]*'),
        (310, 200, 'cB', 60, '*x[1,3-5]*'),
        (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
        (330, 400, 'cD', 60, '*x[1-3-5]*'),
        (340, 500, 'cE', 60, '*x[1-4]*'),
    ]

    h2b.browseTheCloud()
    cnum = 0

    # create them all first
    for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList:
        cnum += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        for fileN in range(FILENUM):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList)

    for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList:
        cnum += 1
        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        h2o.nodes[0].import_files(SYNDATASETS_DIR)

        # pattern match all, then use exclude
        parseKey = h2o.nodes[0].parse('*', key2=key2, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs)
        print "parseKey['destination_key']: " + parseKey['destination_key']
        print 'parse time:', parseKey['response']['time']

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # FIX! h2o strips one of the headers, but treats all the other files with headers as data
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        print "\n" + parseKey['destination_key'] + ":", \
            " num_rows:", "{:,}".format(num_rows), \
            " num_cols:", "{:,}".format(num_cols)

        # all should have rowCount rows (due to the excludePattern)
        self.assertEqual(num_rows, rowCount * FILENUM,
            msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \
            (num_rows, rowCount, FILENUM)))
def setUpClass(cls):
    # fails with 3
    global local_host
    local_host = 'hosts' not in os.getcwd()
    if (local_host):
        h2o.build_cloud(3, java_heap_GB=4, use_flatfile=True)
    else:
        h2o_hosts.build_cloud_with_hosts()
    h2b.browseTheCloud()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if (localhost):
        h2o.build_cloud(node_count=1)
    else:
        h2o_hosts.build_cloud_with_hosts(node_count=1)
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
    h2b.browseTheCloud()
def test_rf_predict_fvec(self):
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()

    h2o.beta_features = True
    trees = 6
    timeoutSecs = 20
    hex_key = "iris2.csv.hex"
    parseResult = h2i.import_parse(bucket="smalldata", path="iris/iris2.csv", schema="put", hex_key=hex_key)
    h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, destination_key="iris_rf_model", timeoutSecs=timeoutSecs)

    print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key. Inspect/Summary result"
    start = time.time()
    predict = h2o.nodes[0].generate_predictions(
        model_key="iris_rf_model", data_key=hex_key, prediction="predict.hex"
    )
    print "generate_predictions end on ", hex_key, " took", time.time() - start, "seconds"
    print "predict:", h2o.dump_json(predict)

    csvPredictPathname = SYNDATASETS_DIR + "/" + "iris2.predict.csv"
    h2o.nodes[0].csv_download(src_key="predict.hex", csvPathname=csvPredictPathname)

    inspect = h2o_cmd.runInspect(key="predict.hex")
    print "inspect:", h2o.dump_json(inspect)
    # print h2o.dump_json(predict)

    # no min/max any more with enums?
    expectedCols = {
        # "max": 2.0,
        # "mean": 1.0,
        # "min": 0.0,
        "naCnt": 0,
        # "name": 0,
        # Enum or real?
        # "type": "Real",
    }
    predictCols = inspect["cols"][0]
    diffKeys = [k for k in expectedCols if predictCols[k] != expectedCols[k]]
    for k in diffKeys:
        raise Exception(
            "Checking H2O summary results, wrong %s: %s, should be: %s" % (k, predictCols[k], expectedCols[k])
        )

    expected = {
        "numRows": 150,
        "numCols": 4,
        # "byteSize": 2843,
    }
    diffKeys = [k for k in expected if inspect[k] != expected[k]]
    print "diffKeys", diffKeys
    for k in diffKeys:
        raise Exception("%s : %s != %s" % (k, inspect[k], expected[k]))
def test_GLM_catdata_hosts(self):
    # these are still in /home/kevin/scikit/datasets/logreg
    # FIX! just two for now..
    csvFilenameList = [
        "1_100kx7_logreg.data.gz",
        "2_100kx7_logreg.data.gz"
    ]

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # save the first, for all comparisons, to avoid slow drift with each iteration
    validations1 = {}
    for csvFilename in csvFilenameList:
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        # I use this if i want the larger set in my localdir
        # csvPathname = h2o.find_file('/home/kevin/scikit/datasets/logreg/' + csvFilename)

        print "\n" + csvPathname
        start = time.time()

        # FIX! why can't I include 0 here? it keeps getting 'unable to solve' if 0 is included
        # 0 by itself is okay?
        kwargs = {
            'y': 7,
            'x': '1,2,3,4,5,6',
            'family': "binomial",
            'n_folds': 3,
            'lambda': 1e-4
        }
        timeoutSecs = 200
        glm = h2o_cmd.runGLM(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 6, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm['GLMModel']
        validationsList = glm['GLMModel']['validations']
        print validationsList
        validations = validationsList[0]

        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_hdfs_multi_bad_csv(self):
    print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # defaults to /datasets
    h2i.setupImportHdfs()
    parseKey = h2o.nodes[0].parse('*airlines_all*csv', key2='random_csv.hex',
        exclude=None, header=None, timeoutSecs=600)
    print "*csv* regex to hdfs /datasets", 'parse time:', parseKey['response']['time']
    print "parse result:", parseKey['destination_key']
    sys.stdout.flush()
def test_exec2_sum(self):
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

        print "\n" + csvPathname
        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key,
            maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
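# zeroList and exprList come from module scope and aren't shown in this section.
# In tests of this style they are Exec expression templates that the h2e/h2o_exec
# helpers substitute per column; a plausible shape, with placeholder syntax and
# values assumed rather than taken from the originals:
zeroList = [
    'Result0 = 0',
]
exprList = [
    'Result<n> = sum(<keyX>[<col1>])',
]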
def test_multi_with_a_browser(self):
    h2b.browseTheCloud()

    # csvPathname = '../smalldata/poker/poker1000'
    # h2o_cmd.runRF(trees=10000, timeoutSecs=300, csvPathname=csvPathname)
    # h2b.browseJsonHistoryAsUrlLastMatch("RFView")
    # browseJsonHistoryAsUrl()

    # hang for many hours, so you can play with the browser
    # FIX!, should be able to do something that waits till browser is quit?
    if not h2o.browse_disable:
        time.sleep(500000)
def check_cloud_and_setup_next():
    h2b.browseTheCloud()
    h2o.verify_cloud_size()
    h2o.check_sandbox_for_errors()
    print "Tearing down cloud of size", len(h2o.nodes)
    h2o.tear_down_cloud()
    h2o.clean_sandbox()
    # wait to make sure no sticky ports or anything os-related
    # so let's expand the delay if larger number of jvms
    # 1 second per node seems good
    h2o.verboseprint("Waiting", node_count, "seconds to avoid OS sticky port problem")
    time.sleep(node_count)
def test_exec_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    if localhost:
        maxTrials = 200
        csvFilenameAll = [
            ("covtype.data", "cA", 15),
        ]
    else:
        maxTrials = 20
        csvFilenameAll = [
            ("covtype.data", "cB", 15),
            ("covtype20x.data", "cD", 60),
        ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    cnum = 0
    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        cnum += 1
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        print "\n" + csvFilename
        h2e.exec_zero_list(zeroList)
        # we use colX+1 so keep it to 53
        # we use factor in this test...so timeout has to be bigger!
        h2e.exec_expr_list_rand(lenNodes, exprList, key2,
            maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_rf_airlines_2013_fvec(self):
    h2o.beta_features = True
    h2b.browseTheCloud()

    csvFilename = 'year2013.csv'
    hex_key = 'year2013.hex'
    importFolderPath = 'airlines'
    csvPathname = importFolderPath + "/" + csvFilename

    start = time.time()
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
        hex_key=hex_key, timeoutSecs=900, doSummary=False)
    parse_time = time.time() - start
    print "parse took {0} sec".format(parse_time)

    start = time.time()
    # noise=['JStack','cpu','disk'])
    h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
    elapsed = time.time() - start
    print "summary took {0} sec".format(elapsed)

    trees = 10
    paramsTrainRF = {
        'ntrees': trees,
        'max_depth': 20,
        'nbins': 200,
        'ignored_cols_by_name':
            'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,' +
            'TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,' +
            'LateAircraftDelay,IsArrDelayed',
        'timeoutSecs': 14800,
    }
    kwargs = paramsTrainRF.copy()
    start = time.time()
    rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
    elapsed = time.time() - start
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
        len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
        trees, classification_error, classErrorPctList, totalScores)
    print "\n" + l
    h2o.cloudPerfH2O.message(l)

    # just to make sure we test this
    h2i.delete_keys_at_all_nodes(pattern=hex_key)