def test_GLM_mnist_s3n_fvec(self): csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] importFolderPath = "mnist" csvPathname = importFolderPath + "/*" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: # PARSE test**************************************** csvPathname = importFolderPath + "/" + testCsvFilename testHexKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # PARSE train**************************************** csvPathname = importFolderPath + "/" + trainCsvFilename trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'response': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM_params_rand2_newargs(self): csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() y = 54 print "Want to see if there are constant columns" goodX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "goodX:", goodX # intermittent fail on the forced params? for trial in range(10 if DO_FAIL_ONLY else 20): if DO_FAIL_ONLY: params = define_params_fail() else: # params is mutable. This is default. params = {'y': y, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1} h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_c7_rel(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 if DO_GLM: start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs) print h2o.dump_json(importResult) storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30) # print h2o.dump_json(storeViewResult) hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print 
"\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_c7_rel(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + csvFilename, " num_rows:", "{:,}".format(num_rows), " num_cols:", "{:,}".format(num_cols) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=num_cols, numRows=num_rows, max_column_display=2500) # it's in runSummary! 
# h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numCols=num_cols, numRows=num_rows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 if DO_GLM: start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def run_glms(file, configs): output = None if not os.path.exists('glmbench.csv'): output = open('glmbench.csv', 'w') output.write(','.join(csv_header) + '\n') else: output = open('glmbench.csv', 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') # header! # csvWrt.writerow(dict((fn,fn) for fn in csv_header)) csvWrt.writeheader() try: java_heap_GB = h2o.nodes[0].java_heap_GB k = parse_file(file) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=configs[0]['y'], key=k, timeoutSecs=300) for kwargs in configs: start = time.time() res = h2o.nodes[0].GLM(k, timeoutSecs=6000000, pollTimeoutSecs=180, **kwargs) wall_clock_secs = time.time() - start glm = res['GLMModel'] print "glm model time (milliseconds):", glm['model_time'] print "glm validations[0] time (milliseconds):", glm[ 'validations'][0]['val_time'] print "glm lsm time (milliseconds):", glm['lsm_time'] print 'glm computation time', res['computation_time'] coefs = glm['coefficients'] print 'wall clock in', wall_clock_secs, 'secs' max_len = 0 val = glm['validations'][0] row = {'time': time.asctime(), 'nodes#': len(h2o.nodes)} row.update(kwargs) row.update(glm) row.update(val) row.update({'wall_clock_secs': wall_clock_secs}) row.update({'java_heap_GB': java_heap_GB}) csvWrt.writerow(row) h2o.nodes[0].remove_key(k) finally: output.close()
def test_parse_summary_manyfiles_s3_fvec(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirlist = [("manyfiles-nflx-gz", 800)] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: # change to 50 files csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only( bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs ) print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, ) elapsed = time.time() - start print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360) print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_storeview_import(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = 
h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_summary_airline_s3(self): h2o.beta_features = True csvFilelist = [ ("allyears2k.csv", 300), #4.4MB ("year1987.csv", 600), #130MB ("allyears.csv", 900), #12GB # ("allyears_10.csv", 1800), #119.98GB ] bucket = 'h2o-airlines-unpacked' (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3') s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import s3" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() # this is schema='local'k parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_short(self): csvFilename = 'part-00000b' ### csvFilename = 'short' importFolderPath = '/home/hduser/data' importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, separator=9) print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds" print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM2_many_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (2, 100, 'cA', 300), # (4, 200, 'cA', 300), # (10000, 1000, 'cB', 300), # (10000, 3000, 'cC', 500), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename y = colCount # normally we dno't create x and rely on the default # create the big concat'ed x like the browser, to see what happens # x = ','.join(map(str, range(colCount))) kwargs = { 'response': 'C' + str(y), 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, } start = time.time() x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnStringX=False) # all-zero/all-na cols are dropped. figure out expected # of coefficients glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) expectedCoeffNum = len(x) # check that the number of entries in coefficients is right (intercept is in there) actualCoeffNum = len(glm['glm_model']['submodels'][0]['beta']) - 1 if actualCoeffNum!=expectedCoeffNum: raise Exception("Should be %s expected coefficients in result." % expectedCoeffNum)
def test_parse_summary_airline_s3(self): csvFilelist = [ ("allyears2k.csv", 300), #4.4MB ("year1987.csv", 600), #130MB ("allyears.csv", 900), #12GB # ("allyears_10.csv", 1800), #119.98GB ] bucket = 'h2o-airlines-unpacked' (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3') s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import s3" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() # this is schema='local'k parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_summary_manyfiles_1_fvec(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirlist = [ ("manyfiles-nflx-gz", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_1.dat.gz" (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs) print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numCols, 542) self.assertEqual(numRows, 100000) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** # pass numRows, so we know when na cnt means row is all na's summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, numCols=numCols, numRows=numRows) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_summary_manyfiles_s3n(self): # these will be used as directory imports/parse csvDirlist = [ ("manyfiles-nflx-gz", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_summary_manyfiles_s3n(self): # these will be used as directory imports/parse csvDirlist = [ ("manyfiles", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def run_glms(file, configs): output = None if not os.path.exists("glmbench_gaussian"): output = open("glmbench_gaussian", "w") output.write(",".join(csv_header) + "\n") else: output = open("glmbench_gaussian", "a") csvWrt = csv.DictWriter( output, fieldnames=csv_header, restval=None, dialect="excel", extrasaction="ignore", delimiter="," ) # header! # csvWrt.writerow(dict((fn,fn) for fn in csv_header)) csvWrt.writeheader() try: java_heap_GB = h2o.nodes[0].java_heap_GB k = parse_file(file) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=configs[0]["y"], key=k, timeoutSecs=300) for kwargs in configs: start = time.time() res = h2o.nodes[0].GLM(k, timeoutSecs=6000000, pollTimeoutSecs=180, **kwargs) wall_clock_secs = time.time() - start glm = res["GLMModel"] print "glm model time (milliseconds):", glm["model_time"] print "glm validations[0] time (milliseconds):", glm["validations"][0]["val_time"] print "glm lsm time (milliseconds):", glm["lsm_time"] print "glm computation time", res["computation_time"] coefs = glm["coefficients"] print "wall clock in", wall_clock_secs, "secs" max_len = 0 val = glm["validations"][0] row = {"time": time.asctime(), "nodes#": len(h2o.nodes)} row.update(kwargs) row.update(glm) row.update(val) row.update({"wall_clock_secs": wall_clock_secs}) row.update({"java_heap_GB": java_heap_GB}) csvWrt.writerow(row) h2o.nodes[0].remove_key(k) finally: output.close()
def run_glms(file,configs): output = None if not os.path.exists('glmbench.csv'): output = open('glmbench.csv','w') output.write(','.join(csv_header)+'\n') else: output = open('glmbench.csv','a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') # header! # csvWrt.writerow(dict((fn,fn) for fn in csv_header)) csvWrt.writeheader() try: java_heap_GB = h2o.nodes[0].java_heap_GB k = parse_file(file) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=configs[0]['y'], key=k, timeoutSecs=300) for kwargs in configs: start = time.time() res = h2o.nodes[0].GLM(k, timeoutSecs=6000000, pollTimeoutSecs=180, **kwargs) wall_clock_secs = time.time() - start glm = res['GLMModel'] print "glm model time (milliseconds):", glm['model_time'] print "glm validations[0] time (milliseconds):", glm['validations'][0]['val_time'] print "glm lsm time (milliseconds):", glm['lsm_time'] print 'glm computation time',res['computation_time'] coefs = glm['coefficients'] print 'wall clock in', wall_clock_secs, 'secs' max_len = 0 val = glm['validations'][0] row = {'time':time.asctime(),'nodes#':len(h2o.nodes)} row.update(kwargs) row.update(glm) row.update(val) row.update({'wall_clock_secs': wall_clock_secs}) row.update({'java_heap_GB': java_heap_GB}) csvWrt.writerow(row) h2o.nodes[0].remove_key(k) finally: output.close()
def test_speedrf_mnist(self): importFolderPath = "mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("train.csv.gz", "test.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 784 # last column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) ntrees = 10 params = { 'response': y, 'ignored_cols_by_name': ignore_x, 'ntrees': ntrees, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 15, 'sample_rate': 0.67, 'destination_key': 'SpeeDRF_model', 'nbins': 1024, 'seed': 784834182943470027, 'oobee': 1, } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) rfv["drf_model"] = rfv.pop("speedrf_model") h2o_rf.simpleCheckRFView(None, rfv, **params) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] print "Total trees: ", used_trees print "On data key: ", data_key print "Produced model key: ", model_key
def test_parse_nflx_loop_s3n_hdfs(self):
    # Benchmark loop: for each dataset entry, build a cloud, import the folder(s),
    # parse the matching files, optionally run GLM / GLMGrid, log timing lines via
    # h2o.cloudPerfH2O, delete the imported keys, then tear the cloud down.
    #
    # NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    # source; confirm which statements belong inside the csvFolder loop.
    DO_GLM = True
    DO_GLMGRID = False
    USE_S3 = False
    noPoll = False
    # only the last assignment takes effect; the earlier ones are left as toggles
    benchmarkLogging = ['jstack','iostats']
    benchmarkLogging = ['iostats']
    benchmarkLogging = []
    # typical size of the michal files
    avgMichalSize = 116561140
    avgSynSize = 4020000
    synSize = 183
    # each entry: (folder list, file glob, name used for the hex key, total bytes, timeout secs)
    csvFilenameList = [
        (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
        (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
        (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
        (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
        # beware: the files should be non-overlapping sequentially if noPoll is used,
        # to avoid deleting keys in use
        (["A-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
    ]

    print "Using the -.gz files from s3"
    # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
    # split out the pattern match and the filename used for the hex
    trialMax = 1
    pollTimeoutSecs = 180
    retryDelaySecs = 10

    # use i to forward reference in the list, so we can do multiple outstanding parses below
    for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):

        bucket = "home-0xdiag-datasets"
        ## for tryHeap in [54, 28]:
        h2oPerNode = 1
        # h1.4xlarge 60.5GB dram
        for tryHeap in [28]:
            if USE_S3:
                protocol = "s3"
            else:
                protocol = "s3n"
            print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"

            # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
            # jea = "-Dh2o.find-ByteBuffer-leaks=true"
            # a fresh cloud is built here and torn down at the end of this tryHeap iteration
            h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10)
            # java_extra_args=jea,

            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandboxIgnoreErrors = True

            for trial in range(trialMax):
                # import a list of folders, one at a time (hdfs import can't take pattern match
                # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                # too slow
                for csvFolder in csvFolderList:
                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        (importResult, importPattern) = h2i.import_only(
                            bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3')
                    else:
                        (importResult, importPattern) = h2i.import_only(
                            bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs')

                    # counts imported keys seen before the first one containing csvFilepattern
                    foundKeys = 0
                    for s in importResult['succeeded']:
                        # just print the first tile
                        # if 'nflx' in key and 'file_1.dat.gz' in key:
                        if csvFilepattern in s['key']:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", s['key']
                            break
                        else:
                            pass
                        foundKeys += 1

                ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                # error if none?
                self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?")

                src_key = csvFilepattern
                # NOTE(review): hex_key is only printed; it is not passed to import_parse below
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                print "Loading", protocol, "key:", src_key, "to", hex_key
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                # noPoll is False above, so this look-ahead branch does not currently run.
                # NOTE(review): if enabled it looks broken as written: the list entries are
                # 5-tuples but 4 names are unpacked, and URI / importFolderPath are not
                # defined anywhere in this function.
                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        src_key = csvFilepattern
                        hex_key = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", src_key, "to", hex_key
                        parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        src_key = URI + csvFilepattern
                        hex_key = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", src_key, "to", hex_key
                        parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilepattern,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    # NOTE(review): totalBytes2/totalBytes3 only exist if both look-ahead
                    # branches above ran
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    # throughput in MB/sec (1e6 bytes per MB) over the measured parse time
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                y = 378
                if not noPoll:
                    x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM or DO_GLMGRID:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    # NOTE(review): this loop rebinds `i`, the enumerate index of the outer
                    # loop; harmless only while nothing after this point reads the outer i
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                        x.remove(i)
                    x = ",".join(map(str,x))

                    if DO_GLM:
                        algo = 'GLM'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                            'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                        start = time.time()
                        glm = h2o_cmd.runGLM(parseResult=parseResult,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    else:
                        algo = 'GLMGrid'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                            'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                            'lambda': '1e-4',
                            'alpha': '0,0.5',
                            'thresholds': '0.5'
                            }
                        start = time.time()
                        glm = h2o_cmd.runGLMGrid(parseResult=parseResult,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                      "Otherwise it would just parse the cached key."
                ### storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                # have to do the pattern match ourself, to figure out what keys to delete
                # we're deleting the keys in the initial import. We leave the keys we created
                # by the parse. We use unique dest keys for those, so no worries.
                # Leaving them is good because things fill up! (spill)
                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)
def test_c7_rel(self): print "Running with h2o.beta_features=True for all" h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? 
#summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect if DO_INSPECT: x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x else: x = None kwargs = { # 'x': x, 'response': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_many_fp_formats_libsvm_2 (self): h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cA', 300, 'sparse50'), (100, 10000, 'cB', 300, 'sparse'), (100, 40000, 'cC', 300, 'sparse50'), (100, 40000, 'cD', 300, 'sparse'), ] # h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList: # for sel in range(48): # len(caseList) for sel in [random.randint(0,47)]: # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution) selKey2 = key2 + "_" + str(sel) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs, doSummary=False) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] print "\n" + csvFilename # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseKey['destination_key'], timeoutSecs=300, noPrint=True) if DO_SUMMARY: summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output). 
parsed to %s cols" % (colNumberMax+1, num_cols)) # Exec (column sums)************************************************* if DO_COMPARE_SUM: h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1, timeoutSecs=timeoutSecs) print "\n*************" print "colResultList", colResultList print "*************" self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows)) # need to fix this for compare to expected # we should be able to keep the list of fp sums per col above # when we generate the dataset ### print "\nsynColSumDict:", synColSumDict for k,v in synColSumDict.iteritems(): if DO_COMPARE_SUM: # k should be integers that match the number of cols self.assertTrue(k>=0 and k<len(colResultList)) compare = colResultList[k] print "\nComparing col sums:", v, compare # Even though we're comparing floating point sums, the operations probably should have # been done in same order, so maybe the comparison can be exact (or not!) self.assertAlmostEqual(v, compare, places=0, msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare)) synMean = (v + 0.0)/rowCount # enums don't have mean, but we're not enums mean = inspect['cols'][k]['mean'] # our fp formats in the syn generation sometimes only have two places? self.assertAlmostEqual(mean, synMean, places=0, msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean)) num_missing_values = inspect['cols'][k]['num_missing_values'] self.assertEqual(0, num_missing_values, msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
def test_RF_mnist_both(self): importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntree = 25 params = { 'response_variable': 0, # 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'use_non_local_data': 1, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_c10_glm_fvec(self): h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' csvFilename = 'classification1Train.txt' csvPathname = importFolderPath + "/" + csvFilename start = time.time() # hack. force it to NA the header, so we have col names that are not customer senstive below parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False, header=0) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # keepList = [] # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList) # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices ignore_x = [] x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] for i in range(numCols): if i not in x: ignore_x.append(i) # since we're no long zero based, increment by 1 ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) # GLM 
Train*********************************************************** keepPattern = None y = 0 print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x print "ignore_x:", x kwargs = { 'response': y, 'ignored_cols': ignore_x, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Parse Test*********************************************************** GLMModel = glm['glm_model'] modelKey = GLMModel['_key'] csvFilename = 'classification1Test.txt' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
def test_GLM1_GLM2_predict(self): # h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() trees = 15 timeoutSecs = 120 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1 == 0: skipSrcOutputHeader = 1 skipPredictHeader = 1 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' y = 54 expectedPctWrong = 0 if 1 == 0: skipSrcOutputHeader = 1 skipPredictHeader = 1 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' y = 54 expectedPctWrong = 0 if 1 == 1: skipSrcOutputHeader = 1 skipPredictHeader = 1 bucket = 'smalldata' # no header csvPathname = 'iris/iris.csv' hexKey = 'iris.hex' y = 4 expectedPctWrong = 26 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) h2o_cmd.runSummary(key=hexKey) # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) trainKey = parseResult['destination_key'] # just to check. are there any NA/constant cols? ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300) #************************************************************************** # first glm1 CLASS = 1 # try ignoring the constant col to see if it makes a diff kwargs = { 'lsm_solver': LSM_SOLVER, 'standardize': STANDARDIZE, 'y': 'C' + str(y + 1), 'family': FAMILY, 'n_folds': 0, 'max_iter': MAX_ITER, 'beta_epsilon': BETA_EPSILON, 'case': CLASS, 'case_mode': '=', } timeoutSecs = 120 kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # hack. 
fix bad 'family' ('link' is bad too)..so h2o_glm.py works right glm['GLMModel']['GLMParams']['family'] = FAMILY print "glm1 end on ", csvPathname, 'took', time.time( ) - start, 'seconds' (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) iterations1 = glm['GLMModel']['iterations'] err1 = glm['GLMModel']['validations'][0]['err'] nullDev1 = glm['GLMModel']['validations'][0]['nullDev'] resDev1 = glm['GLMModel']['validations'][0]['resDev'] if FAMILY == 'binomial': classErr1 = glm['GLMModel']['validations'][0]['classErr'] auc1 = glm['GLMModel']['validations'][0]['auc'] #************************************************************************** # then glm2 kwargs = { # 'ignored_cols': 'C29', 'standardize': STANDARDIZE, 'response': 'C' + str(y + 1), 'family': FAMILY, 'n_folds': 0, 'max_iter': MAX_ITER, 'beta_epsilon': BETA_EPSILON } timeoutSecs = 120 # class 1=1, all else 0 if FAMILY == 'binomial': execExpr = "B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % ( trainKey, y + 1, trainKey, y + 1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) bHack = {'destination_key': 'B.hex'} else: bHack = parseResult kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA}) # kwargs.update({'alpha': 0.0, 'lambda': 0}) # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5) # kwargs.update({'alpha': 0.0, 'lambda': 0.0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs) print "glm2 end on ", csvPathname, 'took', time.time( ) - start, 'seconds' (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) #************************************************************************** modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] iteration = submodels[-1]['iteration'] resDev = validation['residual_deviance'] 
nullDev = validation['null_deviance'] if FAMILY == 'binomial': auc = validation['auc'] self.assertLess(iterations1, MAX_ITER - 1, msg="GLM1: Too many iterations, didn't converge %s" % iterations1) self.assertLess(iteration, MAX_ITER - 1, msg="GLM2: Too many iterations, didn't converge %s" % iteration) nullDevExpected = nullDev1 # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2, # msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected)) iterationExpected = iterations1 # self.assertAlmostEqual(iteration, iterationExpected, delta=2, # msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected)) # coefficients is a list. coeff0 = coefficients[0] coeff0Expected = coefficients1[0] print "coeff0 pct delta:", "%0.3f" % ( 100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected)) self.assertTrue( h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5), msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected)) coeff2 = coefficients[2] coeff2Expected = coefficients1[2] print "coeff2 pct delta:", "%0.3f" % ( 100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected)) self.assertTrue( h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5), msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected)) # compare to known values GLM1 got for class 1 case, with these parameters # aucExpected = 0.8428 if FAMILY == 'binomial': aucExpected = auc1 self.assertAlmostEqual( auc, aucExpected, delta=10, msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected)) interceptExpected = intercept1 print "intercept pct delta:", 100.0 * ( abs(intercept) - abs(interceptExpected)) / abs(interceptExpected) self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5), msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected)) # avg_errExpected = 0.2463 avg_errExpected = err1 # self.assertAlmostEqual(avg_err, avg_errExpected, 
delta=0.50*avg_errExpected, # msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected)) # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold, # msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35)) #******************** # Print comparison #******************** interceptDelta = abs(abs(intercept1) - abs(intercept)) cDelta = [ abs(abs(a) - abs(b)) for a, b in zip(coefficients1, coefficients) ] def printit(self, a, b, c, d): pctDiff = abs(d / c) * 100 print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \ ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d) # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept") printit(self, "intercept", "", intercept1, interceptDelta) print "compare lengths coefficients1, coefficients, cDelta:", len( coefficients1), len(coefficients), len(cDelta) print "GLM1:", coefficients1 print "GLM2:", coefficients print "cDelta:", cDelta for i, cValue in enumerate(coefficients): printit(self, "coefficient", "C" + str(i), cValue, cDelta[i]) hexKey = 'B.hex' pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey, csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader, translate=None, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) self.assertAlmostEqual( pctWrong, expectedPctWrong, delta=2.0, msg= "predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
def test_GLM2_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (2, 100, 'cA', 300), # (4, 200, 'cA', 300), (10000, 1000, 'cB', 300), (10000, 3000, 'cC', 500), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=90) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename y = colCount # normally we dno't create x and rely on the default # create the big concat'ed x like the browser, to see what happens # x = ','.join(map(str, range(colCount))) kwargs = { 'response': 'C' + str(y), 'max_iter': 10, 'n_folds': 1, 'alpha': 0.0, 'lambda': 0.0, } start = time.time() x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnStringX=False) # all-zero/all-na cols are dropped. figure out expected # of coefficients glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) expectedCoeffNum = len(x) # check that the number of entries in coefficients is right (intercept is in there) actualCoeffNum = len(glm['glm_model']['submodels'][0]['beta']) - 1 if actualCoeffNum != expectedCoeffNum: raise Exception( "Should be %s expected coefficients in result." % expectedCoeffNum)
def test_GLM_mnist(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_parse_summary_airline_s3n(self): URI = "s3n://h2o-airlines-unpacked/" csvFilelist = [ ("allyears2k.csv", 300), #4.4MB ("year1987.csv", 600), #130MB ("allyears.csv", 900), #12GB # ("allyears_10.csv", 1800), #119.98GB ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importHDFSResult = h2o.nodes[0].import_hdfs(URI) ### print "importHDFSResult:", h2o.dump_json(importHDFSResult) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList), 8, "Should see more than 8 files in s3n?") if 1 == 0: # slow? print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename s3nKey = URI + csvPathname # PARSE**************************************** key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y='IsArrDelayed', key=parseKey['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** if 1 == 0: # slow print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_GLM2_mnist_short(self): h2o.beta_features = True importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" # first col is pixel value ..use 0 here y = 0 ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, # first column is pixel value 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=trainKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='A.hex', cols=0, max_ncols=1, noPrint=False) execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=testKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='B.hex', cols=0, max_ncols=1, noPrint=False) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm)
def test_speedrf_mnist(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("train.csv.gz", "test.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 784 # last column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntrees = 10 params = { 'response': y, 'ignored_cols_by_name': ignore_x, 'ntrees': ntrees, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 15, 'sample_rate': 0.67, 'destination_key': 'SpeeDRF_model', 'nbins': 1024, 'seed': 784834182943470027, 'oobee': 1, } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) rfv["drf_model"] = rfv.pop("speedrf_model") h2o_rf.simpleCheckRFView(None, rfv, **params) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] print "Total trees: ", used_trees print "On data key: ", data_key print "Produced model key: ", model_key
def test_c10_glm_fvec(self): print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' csvFilename = 'classification1Train.txt' csvPathname = importFolderPath + "/" + csvFilename start = time.time() # hack. force it to NA the header, so we have col names that are not customer senstive below parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False, header=0) print "Parse of", parseResult['destination_key'], "took", time.time( ) - start, "seconds" print "Parse result['destination_key']:", parseResult[ 'destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # keepList = [] # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList) # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices y = 0 ignore_x = [] x = [ 6, 7, 8, 10, 12, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 43, 44, 45, 46, 47, 49, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70 ] for i in range(numCols): if i not in x and i != y: ignore_x.append(i) # since we're no long zero based, increment by 1 ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x)) # 
GLM Train*********************************************************** keepPattern = None print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x print "ignore_x:", x kwargs = { 'response': y, 'ignored_cols': ignore_x, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Parse Test*********************************************************** GLMModel = glm['glm_model'] modelKey = GLMModel['_key'] csvFilename = 'classification1Test.txt' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time( ) - start, "seconds"
def test_rf_mnist_both_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "Not using ignore from this..have to adjust cols?" h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntree = 2 params = { 'response': 'C1', # 'ignored_cols_by_name': ignore_x, 'ntrees': ntree, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': 'RF_model', 'nbins': 100, 'importance': 0, 'balance_classes': 0, } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print 'rfView:', h2o.dump_json(rfView) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['drf_model']['_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 
1 on oobe causes stack trace? kwargs = {'response': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # training and test data are unique, so error won't be low? # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = { 'min': rfView['drf_model']['treeStats']['minLeaves'], 'mean': rfView['drf_model']['treeStats']['meanLeaves'], 'max': rfView['drf_model']['treeStats']['maxLeaves'], } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': rfView['drf_model']['treeStats']['minDepth'], 'mean': rfView['drf_model']['treeStats']['meanDepth'], 'max': rfView['drf_model']['treeStats']['maxDepth'], } depthExpected = {'min': 20, 'mean': 20, 'max': 20} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. 
different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_RF_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local', hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) params = { 'response': 'C' + str(y), 'cols': None, 'ignored_cols_by_name': ignore_x, 'classification': 1, 'validation': None, 'ntrees': 10, 'max_depth': 20, 'min_rows': None, 'nbins': 1000, 'mtries': None, 'sample_rate': 0.66, 'seed': None, } rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow params['destination_key'] = 'RFModel_' + str('jobDispatch') kwargs = params.copy() timeoutSecs = 1200 start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs) elapsed = time.time() - start # print h2o.dump_json(rfResult) print "rf job dispatch end on ", trainCsvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # FIX! are these already in there? rfView = {} rfView['data_key'] = trainKey2 rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] rfViewInitial.append(rfView) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) # FIX! need to add the rfview and predict stuff # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertAlmostEqual( classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) # print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cms = rf_model['cms'] ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] # FIX! should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
def test_GLM2_mnist_reals(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x GLM will use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x modelKey = "mnist" params = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey } # for c in [0,1,2,3,4,5,6,7,8,9]: # just do a couple digits for c in [0,7]: print "Trying binomial with case:", c execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) kwargs = params.copy() timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) # Score ********************************************** execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) print "Problems with test data having different enums than train? just use train for now" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key="B.hex", model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual="B.hex", vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_parse_summary_manyfiles_1_fvec(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirlist = [ ("manyfiles-nflx-gz", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_1.dat.gz" (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs) print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numCols, 542) self.assertEqual(numRows, 100000) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** # pass numRows, so we know when na cnt means row is all na's summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, numCols=numCols, numRows=numRows) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_c7_rel(self): print "Running with h2o.beta_features=True for all" h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? 
#summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_both(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ntree = 25 params = { 'response': 0, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 2147483647, 'select_stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample_rate': 0.67, 'oobee': 1, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'destination_key': 'RF_model', 'nbins': 1024, # 'seed': 784834182943470027, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # RFView (score on test)**************************************** (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # was 2.84 # sometimes get 2.87? self.assertAlmostEqual( classification_error, 1.6, delta=1.6, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['speedrf_model']['treeStats'] leaves = { 'min': treeStats['minLeaves'], 'mean': treeStats['meanLeaves'], 'max': treeStats['maxLeaves'] } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': treeStats['minDepth'], 'mean': treeStats['meanDepth'], 'max': treeStats['maxDepth'] } depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() modelKey = rfView['speedrf_model']['_key'] predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_PCA_manyfiles_fvec(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'PCAModelKey' files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800) ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378] print ignore_x ignored_cols = ",".join(map(lambda x: "C" + str(x), ignore_x)) # for comparison ignore_x = h2o_glm.goodXFromColumnInfo(378, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) print ignore_x # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'ignored_cols': ignored_cols, 'tolerance': tolerance, 'standardize': 1, 'max_pc': None, } print "Using these parameters for 
PCA: ", params kwargs = params.copy() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout" print "Checking PCA results: " h2o_pca.simpleCheckPCA(self,pcaResult) h2o_pca.resultsCheckPCA(self,pcaResult) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = h2o_cmd.runInspect(key=modelKey) # errrs from end of list? is that the last tree? sdevs = pcaInspect["PCAModel"]["stdDev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["PCAModel"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_RF_mnist_both(self): importFolderPath = "/home/0xdiag/datasets/mnist_repl" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, parsePattern, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 100 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual( classification_error, 0.028, delta=0.01, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_many_fp_formats_libsvm_2_fvec(self): #h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cA', 300, 'sparse50'), (100, 10000, 'cB', 300, 'sparse'), # (100, 40000, 'cC', 300, 'sparse50'), # (100, 40000, 'cD', 300, 'sparse'), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList: NUM_CASES = h2o_util.fp_format() for sel in [random.randint(0, NUM_CASES - 1)]: # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution) selKey2 = hex_key + "_" + str(sel) print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight') print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs) numCols = inspect['numCols'] numRows = inspect['numRows'] print "\n" + csvFilename # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True) if DO_SUMMARY: summaryResult = h2o_cmd.runSummary( key=selKey2, max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs) 
h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual( colNumberMax + 1, numCols, msg= "generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols)) # Exec (column sums)************************************************* if DO_COMPARE_SUM: h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols( None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs, print_params=False) #print "\n*************" #print "colResultList", colResultList #print "*************" self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows)) # need to fix this for compare to expected # we should be able to keep the list of fp sums per col above # when we generate the dataset sortedColSumDict = OrderedDict(sorted(synColSumDict.items())) print sortedColSumDict for k, v in sortedColSumDict.iteritems(): print k if DO_COMPARE_SUM: # k should be integers that match the number of cols self.assertTrue(k >= 0 and k < len(colResultList)) compare = colResultList[k] print "\nComparing col sums:", v, compare # Even though we're comparing floating point sums, the operations probably should have # been done in same order, so maybe the comparison can be exact (or not!) self.assertAlmostEqual( v, compare, places=0, msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare)) synMean = (v + 0.0) / rowCount # enums don't have mean, but we're not enums mean = float(inspect['cols'][k]['mean']) # our fp formats in the syn generation sometimes only have two places? 
if not h2o_util.approxEqual(mean, synMean, tol=1e-3): execExpr = 'sum(%s[,%s])' % (selKey2, k + 1) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) print "Result of exec sum on failing col:..:", k, h2o.dump_json( resultExec) print "Result of remembered sum on failing col:..:", k, v print "Result of inspect mean * rowCount on failing col..:", mean * rowCount print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean sys.stdout.flush() raise Exception( 'col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean)) naCnt = inspect['cols'][k]['naCnt'] self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_many_fp_formats_libsvm(self): # h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 10, 'cA', 30, 'sparse50'), (100, 10, 'cB', 30, 'sparse'), (100000, 100, 'cC', 30, 'sparse'), (1000, 10, 'cD', 30, 'sparse50'), (100, 100, 'cE', 30, 'sparse'), (100, 100, 'cF', 30, 'sparse50'), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList: NUM_CASES = h2o_util.fp_format() for sel in [random.randint(0, NUM_CASES - 1)]: # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution) selKey2 = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] print "\n" + csvFilename # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseResult['destination_key'], timeoutSecs=300) if DO_SUMMARY: summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # we might have added some zeros at the end, that our colNumberMax won't include print synColSumDict.keys(), colNumberMax self.assertEqual( colNumberMax + 1, numCols, msg= 
"generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols)) # Exec (column sums)************************************************* h2e.exec_zero_list(zeroList) # how do we know the max dimension (synthetic may not generate anything for the last col) # use numCols?. numCols should be <= colCount. colSumList = h2e.exec_expr_list_across_cols( None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs) self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows)) # need to fix this for compare to expected # we should be able to keep the list of fp sums per col above # when we generate the dataset print "\ncolSumList:", colSumList print "\nsynColSumDict:", synColSumDict for k, v in synColSumDict.iteritems(): if k > colNumberMax: # ignore any extra 0 cols at the end continue # k should be integers that match the number of cols self.assertTrue( k >= 0 and k < len(colSumList), msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols)) syn = {} if k == 0: syn['name'] = "C1" syn['type'] = {'Int'} syn['min'] = classMin syn['max'] = classMax # don't check these for the col 0 'Target' # syn['scale'] = {1} elif k == 1: # we forced this to always be 0 syn['name'] = "C2" syn['type'] = {'Int'} syn['min'] = 0 syn['max'] = 0 # syn['scale'] = {1} else: syn['name'] = "C" + str(k + 1) syn['type'] = {'Int', 'Real'} syn['min'] = valMin syn['max'] = valMax # syn['scale'] = {1,10,100,1000} syn['naCnt'] = 0 syn['cardinality'] = -1 # syn['min'] = 0 # syn['max'] = 0 # syn['mean'] = 0 cols = inspect['cols'][k] for synKey in syn: # we may not see the min/max range of values that was bounded by our gen, but # we can check that it's a subset of the allowed range if synKey == 'min': self.assertTrue( syn[synKey] <= cols[synKey], msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey])) elif synKey == 'max': self.assertTrue( syn[synKey] >= cols[synKey], msg='col %s %s %s should be 
>= %s' % (k, synKey, cols[synKey], syn[synKey])) elif synKey == 'type': if cols[synKey] not in syn[synKey]: print "cols min/max:", cols['min'], cols['max'] print "syn min/max:", syn['min'], syn['max'] raise Exception( 'col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey])) else: self.assertEqual( syn[synKey], cols[synKey], msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey])) colSum = colSumList[k] print "\nComparing col", k, "sums:", v, colSum # Even though we're comparing floating point sums, the operations probably should have # been done in same order, so maybe the comparison can be exact (or not!) self.assertAlmostEqual( float(v), colSum, places=0, msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_c10_rel_glm(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' csvFilename = 'classification1Train.txt' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # keepList = [] # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList) # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices # since we're no long zero based, increment by 1 x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] x = ['C' + str(i + 1) for i in x_from_zero] y = 0 # GLM Train*********************************************************** keepPattern = None # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, 
keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "from goodX (not used) x:", x print "y:", y # have to use named cols, and they start with 1 kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Parse Test*********************************************************** GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] csvFilename = 'classification1Test.txt' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" # GLMScore Test*********************************************************** start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
def test_GLM1_GLM2_train_pred_fvec(self): h2o.beta_features = False SYNDATASETS_DIR = h2o.make_syn_dir() trees = 15 timeoutSecs = 120 if 1 == 0: bucket = "home-0xdiag-datasets" csvPathname = "standard/covtype.data" hexKey = "covtype.data.hex" y = 54 if 1 == 1: bucket = "home-0xdiag-datasets" csvPathname = "standard/covtype.shuffled.10pct.data" hexKey = "covtype.shuffled.10pct.data.hex" y = 54 if 1 == 0: bucket = "smalldata" # no header csvPathname = "iris/iris.csv" y = 4 predictHexKey = "predict.hex" predictCsv = "predict.csv" execHexKey = "A.hex" execCsv = "exec.csv" csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvExecPathname = SYNDATASETS_DIR + "/" + execCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema="put", returnFullPath=True) def predict_and_compare_csvs(model_key): start = time.time() predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "runPredict end on ", hexKey, " took", time.time() - start, "seconds" h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, "predict.hex") h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} (rowNum1, originalOutput) = compare_csv_last_col( csvExecPathname, msg="Original, after being exec'ed", skipHeader=True ) (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True) # no header on source if rowNum1 != rowNum2: raise Exception( "original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \ %s" % (rowNum1, rowNum2) ) wrong = 0 wrong0 = 0 wrong1 = 0 for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)): o = 
float(o) p = float(p) if o != p: msg = ( "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) ) if p == 0.0 and wrong0 == 10: print "Not printing any more predicted=0 mismatches" elif p == 0.0 and wrong0 < 10: print msg if p == 1.0 and wrong1 == 10: print "Not printing any more predicted=1 mismatches" elif p == 1.0 and wrong1 < 10: print msg if p == 0.0: wrong0 += 1 elif p == 1.0: wrong1 += 1 wrong += 1 print "wrong0:", wrong0 print "wrong1:", wrong1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong) / len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 16.0: raise Exception("pct wrong: %s too high. Expect < 16 pct error" % pctWrong) # ************************************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey) h2o_cmd.runSummary(key=hexKey) # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) trainKey = parseResult["destination_key"] CLASS = 1 # just to check. are there any NA/constant cols? ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300) # ************************************************************************** # first glm1 h2o.beta_features = False # try ignoring the constant col to see if it makes a diff kwargs = { "lsm_solver": LSM_SOLVER, "standardize": STANDARDIZE, # 'y': 'C' + str(y), "y": "C" + str(y + 1), "family": FAMILY, "n_folds": 1, "max_iter": MAX_ITER, "beta_epsilon": BETA_EPSILON, } if USE_EXEC: # maybe go back to simpler exec here. 
this was from when Exec failed unless this was used execExpr = "A.hex=%s" % trainKey h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # class 1=1, all else 0 if FAMILY == "binomial": execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {"destination_key": "A.hex"} else: # since we're not using predict, we can use case_mode/val to get the binomial output class if FAMILY == "binomial": kwargs.update({"case_mode": "=", "case": 1}) aHack = {"destination_key": hexKey} timeoutSecs = 120 kwargs.update({"case_mode": "=", "case": 1}) kwargs.update({"alpha": TRY_ALPHA, "lambda": TRY_LAMBDA}) # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5) # kwargs.update({'alpha': 0.0, 'lambda': 0.0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs) # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right glm["GLMModel"]["GLMParams"]["family"] = FAMILY print "glm1 end on ", csvPathname, "took", time.time() - start, "seconds" (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) iterations1 = glm["GLMModel"]["iterations"] err1 = glm["GLMModel"]["validations"][0]["err"] nullDev1 = glm["GLMModel"]["validations"][0]["nullDev"] resDev1 = glm["GLMModel"]["validations"][0]["resDev"] if FAMILY == "binomial": classErr1 = glm["GLMModel"]["validations"][0]["classErr"] auc1 = glm["GLMModel"]["validations"][0]["auc"] # ************************************************************************** # then glm2 h2o.beta_features = True kwargs = { # 'ignored_cols': 'C29', "standardize": STANDARDIZE, "classification": 1 if FAMILY == "binomial" else 0, # 'response': 'C' + str(y), "response": "C" + str(y + 1), "family": FAMILY, "n_folds": 1, "max_iter": MAX_ITER, "beta_epsilon": BETA_EPSILON, } timeoutSecs = 120 if USE_EXEC: # maybe go back to simpler exec here. 
this was from when Exec failed unless this was used execExpr = "B.hex=%s" % trainKey h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # class 1=1, all else 0 if FAMILY == "binomial": execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) bHack = {"destination_key": "B.hex"} else: # since we're not using predict, we can use case_mode/val to get the binomial output class if FAMILY == "binomial": kwargs.update({"case_mode": "=", "case_val": 1}) bHack = {"destination_key": hexKey} kwargs.update({"alpha": TRY_ALPHA, "lambda": TRY_LAMBDA}) # kwargs.update({'alpha': 0.0, 'lambda': 0}) # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5) # kwargs.update({'alpha': 0.0, 'lambda': 0.0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs) print "glm2 end on ", csvPathname, "took", time.time() - start, "seconds" (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # ************************************************************************** modelKey = glm["glm_model"]["_key"] avg_err = glm["glm_model"]["submodels"][0]["validation"]["avg_err"] best_threshold = glm["glm_model"]["submodels"][0]["validation"]["best_threshold"] iteration = glm["glm_model"]["submodels"][0]["iteration"] resDev = glm["glm_model"]["submodels"][0]["validation"]["residual_deviance"] nullDev = glm["glm_model"]["submodels"][0]["validation"]["null_deviance"] if FAMILY == "binomial": auc = glm["glm_model"]["submodels"][0]["validation"]["auc"] self.assertLess(iterations1, MAX_ITER - 1, msg="GLM1: Too many iterations, didn't converge %s" % iterations1) self.assertLess(iteration, MAX_ITER - 1, msg="GLM2: Too many iterations, didn't converge %s" % iteration) nullDevExpected = nullDev1 self.assertAlmostEqual( nullDev, nullDevExpected, delta=2, msg="GLM2 nullDev %s is too different from GLM1 %s" % (nullDev, 
nullDevExpected), ) iterationExpected = iterations1 self.assertAlmostEqual( iteration, iterationExpected, delta=2, msg="GLM2 iteration %s is too different from GLM1 %s" % (iteration, iterationExpected), ) # coefficients is a list. coeff0 = coefficients[0] coeff0Expected = coefficients1[0] print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected)) self.assertTrue( h2o_util.approx_equal(coeff0, coeff0Expected, 0.01), msg="GLM2 coefficient 0 %s is too different from GLM1 %s" % (coeff0, coeff0Expected), ) coeff2 = coefficients[2] coeff2Expected = coefficients1[2] print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected)) self.assertTrue( h2o_util.approx_equal(coeff2, coeff2Expected, 0.01), msg="GLM2 coefficient 2 %s is too different from GLM1 %s" % (coeff2, coeff2Expected), ) # compare to known values GLM1 got for class 1 case, with these parameters # aucExpected = 0.8428 if FAMILY == "binomial": aucExpected = auc1 self.assertAlmostEqual( auc, aucExpected, delta=10, msg="GLM2 auc %s is too different from GLM1 %s" % (auc, aucExpected) ) interceptExpected = intercept1 print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected)) / abs(interceptExpected) self.assertTrue( h2o_util.approx_equal(intercept, interceptExpected, 0.01), msg="GLM2 intercept %s is too different from GLM1 %s" % (intercept, interceptExpected), ) # avg_errExpected = 0.2463 avg_errExpected = err1 self.assertAlmostEqual( avg_err, avg_errExpected, delta=0.05 * avg_errExpected, msg="GLM2 avg_err %s is too different from GLM1 %s" % (avg_err, avg_errExpected), ) self.assertAlmostEqual( best_threshold, 0.35, delta=0.01 * best_threshold, msg="GLM2 best_threshold %s is too different from GLM1 %s" % (best_threshold, 0.35), ) predict_and_compare_csvs(model_key=modelKey)
def test_GLM2_mnist(self): if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % ( trainKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % ( testKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM_mnist_reals(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_mnist_s3n(self): URI = "s3n://home-0xdiag-datasets/mnist/" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] # IMPORT********************************************** importHDFSResult = h2o.nodes[0].import_hdfs(URI) ### print "importHDFSResult:", h2o.dump_json(importHDFSResult) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** s3nKey = URI + testCsvFilename testKey2 = testCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, testKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # PARSE train**************************************** s3nKey = URI + trainCsvFilename trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, trainKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] kwargs = {'x': x, 'y': y, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) print "GLMScore in", (time.time() - start), "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o.verboseprint(h2o.dump_json(glmScore))
def test_RF_mnist(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 10 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } kwargs = params.copy() timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? 
kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_reals_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntrees = 10 params = { 'response': 'C1', 'ignored_cols_by_name': ignore_x, 'ntrees': ntrees, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 15, 'sample_rate': 0.67, 'destination_key': 'RF_model', 'nbins': 1024, 'seed': 784834182943470027, 'importance': 0, 'balance_classes': 0, } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfv, **params) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfv = h2o_cmd.runRFView(data_key=testKey2, model_key=model_key, ntrees=ntrees, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfv, **params) self.assertAlmostEqual(classification_error, 9, delta=1.0, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "/home/0xdiag/datasets/standard" csvFilelist = [ ("covtype.data", 300), ] # IMPORT********************************************** # H2O deletes the source key. So re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) # the list could be from hdfs/s3 (ec2 remap) or local. They have to different list structures if 'succeeded' in importFolderResult: succeededList = importFolderResult['succeeded'] elif 'files' in importFolderResult: succeededList = importFolderResult['files'] else: raise Exception("Can't find 'files' or 'succeeded' in import list") ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 3, "Should see more than 3 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** key2 = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseKey['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") result = json.dump(storeViewResult, f, indent=4, sort_keys=True, default=str) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_GLM1_GLM2_predict(self): # h2b.browseTheCloud() h2o.beta_features = False SYNDATASETS_DIR = h2o.make_syn_dir() trees = 15 timeoutSecs = 120 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1==0: skipSrcOutputHeader = 1 skipPredictHeader = 1 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' y = 54 expectedPctWrong = 0 if 1==0: skipSrcOutputHeader = 1 skipPredictHeader = 1 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' y = 54 expectedPctWrong = 0 if 1==1: skipSrcOutputHeader = 1 skipPredictHeader = 1 bucket = 'smalldata' # no header csvPathname = 'iris/iris.csv' hexKey = 'iris.hex' y = 4 expectedPctWrong = 26 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) h2o_cmd.runSummary(key=hexKey) # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) trainKey = parseResult['destination_key'] # just to check. are there any NA/constant cols? 
ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) #************************************************************************** # first glm1 h2o.beta_features = False CLASS = 1 # try ignoring the constant col to see if it makes a diff kwargs = { 'lsm_solver': LSM_SOLVER, 'standardize': STANDARDIZE, 'y': 'C' + str(y+1), 'family': FAMILY, 'n_folds': 0, 'max_iter': MAX_ITER, 'beta_epsilon': BETA_EPSILON, 'case': CLASS, 'case_mode': '=', } timeoutSecs = 120 kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right glm['GLMModel']['GLMParams']['family'] = FAMILY print "glm1 end on ", csvPathname, 'took', time.time() - start, 'seconds' (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) iterations1 = glm['GLMModel']['iterations'] err1 = glm['GLMModel']['validations'][0]['err'] nullDev1 = glm['GLMModel']['validations'][0]['nullDev'] resDev1 = glm['GLMModel']['validations'][0]['resDev'] if FAMILY == 'binomial': classErr1 = glm['GLMModel']['validations'][0]['classErr'] auc1 = glm['GLMModel']['validations'][0]['auc'] #************************************************************************** # then glm2 h2o.beta_features = True kwargs = { # 'ignored_cols': 'C29', 'standardize': STANDARDIZE, 'response': 'C' + str(y+1), 'family': FAMILY, 'n_folds': 0, 'max_iter': MAX_ITER, 'beta_epsilon': BETA_EPSILON} timeoutSecs = 120 # class 1=1, all else 0 if FAMILY == 'binomial': execExpr="B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (trainKey, y+1, trainKey, y+1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) bHack = {'destination_key': 'B.hex'} else: bHack = parseResult kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA}) # kwargs.update({'alpha': 0.0, 'lambda': 0}) # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # 
kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5) # kwargs.update({'alpha': 0.0, 'lambda': 0.0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs) print "glm2 end on ", csvPathname, 'took', time.time() - start, 'seconds' (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) #************************************************************************** modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] iteration = submodels[-1]['iteration'] resDev = validation['residual_deviance'] nullDev = validation['null_deviance'] if FAMILY == 'binomial': auc = validation['auc'] self.assertLess(iterations1, MAX_ITER-1, msg="GLM1: Too many iterations, didn't converge %s" % iterations1) self.assertLess(iteration, MAX_ITER-1, msg="GLM2: Too many iterations, didn't converge %s" % iteration) nullDevExpected = nullDev1 # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2, # msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected)) iterationExpected = iterations1 # self.assertAlmostEqual(iteration, iterationExpected, delta=2, # msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected)) # coefficients is a list. 
coeff0 = coefficients[0] coeff0Expected = coefficients1[0] print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected))/abs(coeff0Expected)) self.assertTrue(h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5), msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected)) coeff2 = coefficients[2] coeff2Expected = coefficients1[2] print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected))/abs(coeff2Expected)) self.assertTrue(h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5), msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected)) # compare to known values GLM1 got for class 1 case, with these parameters # aucExpected = 0.8428 if FAMILY == 'binomial': aucExpected = auc1 self.assertAlmostEqual(auc, aucExpected, delta=10, msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected)) interceptExpected = intercept1 print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected))/abs(interceptExpected) self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5), msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected)) # avg_errExpected = 0.2463 avg_errExpected = err1 # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected, # msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected)) # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold, # msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35)) #******************** # Print comparison #******************** interceptDelta = abs(abs(intercept1) - abs(intercept)) cDelta = [abs(abs(a) - abs(b)) for a,b in zip(coefficients1, coefficients)] def printit(self, a, b, c, d): pctDiff = abs(d/c)*100 print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \ ("GLM2: " + a + " " + b + ":", c, "pct. 
diff:", pctDiff, "abs diff:", d) # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept") printit(self, "intercept", "", intercept1, interceptDelta) print "compare lengths coefficients1, coefficients, cDelta:", len(coefficients1), len(coefficients), len(cDelta) print "GLM1:", coefficients1 print "GLM2:", coefficients print "cDelta:", cDelta for i,cValue in enumerate(coefficients): printit(self , "coefficient", "C"+str(i), cValue, cDelta[i]) hexKey = 'B.hex' pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey, csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader, translate=None, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 2.0, msg="predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
def test_many_fp_formats_libsvm (self):
    """Parse synthetic datasets and check column metadata and column sums.

    For each (rowCount, colCount, hex_key, timeoutSecs, distribution) entry
    one randomly chosen format selector 'sel' in 0..47 is used. A dataset is
    written by write_syn_dataset(), parsed, inspected, and then:
      * the parsed column count must equal colNumberMax+1,
      * the parsed row count must equal rowCount,
      * per-column inspect fields must match the expectations built in 'syn',
      * per-column sums computed through Exec must match the sums recorded
        at generation time (to 0 decimal places).

    Relies on names defined outside this method: write_syn_dataset,
    DO_SUMMARY, zeroList, exprList, classMin, classMax, valMin, valMax.
    """
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key prefix, parse timeout seconds, distribution)
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30,'sparse'),
        (100, 100, 'cF', 30,'sparse50'),
        ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        # for sel in range(48): # len(caseList)
        # Only one randomly picked selector is exercised per tryList entry.
        for sel in [random.randint(0,47)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            # NOTE: goodX is not used after this call.
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax+1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use num_cols?. num_cols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if k > colNumberMax: # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k>=0 and k<len(colSumList),
                    msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                # Expected inspect fields for column k. Set-valued entries
                # list the allowed values; 'min'/'max' are bounds; anything
                # else must match exactly (see the checks below).
                syn = {}
                if k==0:
                    syn['name'] = "Target"
                    syn['size'] = {1,2} # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                    syn['type'] = {'int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    syn['scale'] = {1}
                    # syn['base'] = 0
                    # syn['variance'] = 0
                elif k==1: # we forced this to always be 0
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1}
                    syn['type'] = {'int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    syn['scale'] = {1}
                    syn['base'] = 0
                    syn['variance'] = 0
                else:
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1,2,4,8} # can be 2, 4 or 8? maybe make this a set for membership check
                    syn['type'] = {'int', 'float'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    syn['scale'] = {1,10,100,1000}
                    # syn['base'] = 0
                    # syn['variance'] = 0

                # Expected for every column regardless of the branch above.
                syn['num_missing_values'] = 0
                syn['enum_domain_size'] = 0
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                        # Membership check against the allowed set.
                        if cols[synKey] not in syn[synKey]:
                            # for debug of why it was a bad size
                            print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                            print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        # Everything else must match exactly.
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "/home/0xdiag/datasets/standard" csvFilelist = [("covtype.data", 300)] # IMPORT********************************************** # H2O deletes the source key. So re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) # the list could be from hdfs/s3 (ec2 remap) or local. They have to different list structures if "succeeded" in importFolderResult: succeededList = importFolderResult["succeeded"] elif "files" in importFolderResult: succeededList = importFolderResult["files"] else: raise Exception("Can't find 'files' or 'succeeded' in import list") ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 3, "Should see more than 3 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** key2 = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseKey = h2i.parseImportFolderFile( None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs ) elapsed = time.time() - start print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. 
of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseKey["destination_key"] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseKey["destination_key"], timeoutSecs=360) print "Inspect:", parseKey["destination_key"], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseKey["destination_key"], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") result = json.dump(storeViewResult, f, indent=4, sort_keys=True, default=str) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y), # 'case_mode': '=', # 'case_val': 0, 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, ## 'weight': 1.0, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0,1,2,3,4,5,6,7,8,9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_selfKey'] # This seems wrong..what's the format of the cm? if 1==0: cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM_mnist_s3n(self): URI = "s3n://home-0xdiag-datasets/mnist/" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] # IMPORT********************************************** importHDFSResult = h2o.nodes[0].import_hdfs(URI) ### print "importHDFSResult:", h2o.dump_json(importHDFSResult) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?") trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** s3nKey = URI + testCsvFilename testKey2 = testCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, testKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # PARSE train**************************************** s3nKey = URI + trainCsvFilename trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, trainKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] kwargs = {'x': x, 'y': y, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) print "GLMScore in", (time.time() - start), "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o.verboseprint(h2o.dump_json(glmScore))
def test_RF_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 100 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_reals_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 
1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_reals_fvec(self): importFolderPath = "mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) ntrees = 10 params = { 'response': 'C1', 'ignored_cols_by_name': ignore_x, 'ntrees': ntrees, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 15, 'sample_rate': 0.67, 'destination_key': 'RF_model', 'nbins': 1024, 'seed': 784834182943470027, 'importance': 0, 'balance_classes': 0, } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfv, **params) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfv = h2o_cmd.runRFView(data_key=testKey2, model_key=model_key, ntrees=ntrees, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfv, **params) self.assertAlmostEqual(classification_error, 9, delta=1.0, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)