def exec_expr_list_across_cols(lenNodes, exprList, keyX, minCol=0, maxCol=54, timeoutSecs=10, incrementingResult=True):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):
            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult: # the Result<col> pattern
                resultKey = "Result" + str(colX)
            else: # assume it's a re-assign to self
                resultKey = keyX

            # kbn
            # v1
            # execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, resultKey, timeoutSecs)
            # v2
            execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, None, timeoutSecs)
            print "\nexecResult:", h2o.dump_json(execResultInspect)
            execResultKey = execResultInspect[0]['key']

            # v2: Exec2 'apply' can have no key field? (null) maybe just use keyX then
            if execResultKey:
                resultInspect = h2o_cmd.runInspect(None, execResultKey)
            else:
                resultInspect = h2o_cmd.runInspect(None, keyX)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

            # min is a keyword, so don't use it as a name
            if incrementingResult: # a col will have a single min
                min_value = checkScalarResult(execResultInspect, resultKey)
                h2o.verboseprint("min_value: ", min_value, "col:", colX)
                print "min_value: ", min_value, "col:", colX
            else:
                min_value = None

            sys.stdout.write('.')
            sys.stdout.flush()

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if (h2o.check_sandbox_for_errors()):
                raise Exception("Found errors in sandbox stdout or stderr, on col #%s." % colX)

        print "Column #", colX, "completed\n"
        colResultList.append(min_value)

    return colResultList
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult["destination_key"]
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key, ":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    type = inspectGG["type"]
    if "unparsed" in type:
        print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
        print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG["response"] # dict
    ### rows = inspectGG['rows']
    value_size_bytes = inspectGG["value_size_bytes"]

    model0 = glmGridResult["models"][0]
    alpha = model0["alpha"]
    area_under_curve = model0["area_under_curve"]
    error_0 = model0["error_0"]
    error_1 = model0["error_1"]
    key = model0["key"]
    print "best GLM model key:", key

    glm_lambda = model0["lambda"]

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, key)
    h2o.verboseprint("GLMGrid inspectGLM:", h2o.dump_json(inspectGLM))
    simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
def test_frame_split(self):
    h2o.beta_features = True

    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = "covtype.hex"

    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
        schema='local', timeoutSecs=10)

    print "Just split away and see if anything blows up"
    splitMe = hex_key
    inspect = h2o_cmd.runInspect(key=splitMe)
    origNumRows = inspect['numRows']
    origNumCols = inspect['numCols']
    for s in range(20):
        inspect = h2o_cmd.runInspect(key=splitMe)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5)
        split0_key = fs['split_keys'][0]
        split1_key = fs['split_keys'][1]
        split0_rows = fs['split_rows'][0]
        split1_rows = fs['split_rows'][1]
        split0_ratio = fs['split_ratios'][0]
        split1_ratio = fs['split_ratios'][1]
        print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
        splitMe = split1_key
        # a 50/50 split should be accurate to within 1 row, so assert the halves differ by less than 2
        self.assertLess(abs(split1_rows - split0_rows), 2)
        self.assertEqual(numRows, (split1_rows + split0_rows))
        self.assertEqual(numCols, origNumCols)
        if split0_rows <= 2:
            break
def test_parse_1m_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [(10, 65000, "cH", 30)] h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() print "Summary should work with 65k" parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True ) print csvFilename, "parse time:", parseResult["response"]["time"] print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs) print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, " num_rows:", "{:,}".format( inspect["num_rows"] ), " num_cols:", "{:,}".format(inspect["num_cols"]) # should match # of cols in header or ?? self.assertEqual( inspect["num_cols"], colCount, "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount), ) self.assertEqual( inspect["num_rows"], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % (inspect["num_rows"], rowCount), ) # we should obey max_column_display column_limits = [25, 25000, 50000] for column_limit in column_limits: inspect = h2o_cmd.runInspect( None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs ) self.assertEqual( len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit) ) for r in range(0, len(inspect["rows"])): # NB: +1 below because each row includes a row header row: #{row} self.assertEqual( len(inspect["rows"][r]), column_limit + 1, "inspect data rows obeys max_column_display = " + str(column_limit), )
def test_rapids_basic(self):
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'p'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for execExpr in exprList:
        r = re.match('\(= \!([a-zA-Z0-9_]+) ', execExpr)
        resultKey = r.group(1)
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        if DO_ROLLUP:
            h2o_cmd.runInspect(key=resultKey)
        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr)
        else:
            h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

    print "\nExpressions that created keys. Shouldn't all of these expressions create keys?"
    for k in keys:
        print k

    h2o.check_sandbox_for_errors()
def createTestTrain(srcKey, trainDstKey, testDstKey, percent, outputClass, numCols):
    # will have to live with random extract. will create variance
    print "train: get random %. change class 4 to 1, everything else to 0. factor() to turn real to int (for rf)"

    # Create complexity for no good reason! Do the same thing STUPID_REPEAT times in the single exec expression
    execExpr = ""
    STUPID_REPEAT = 20
    for i in range(STUPID_REPEAT):
        execExpr += "a.hex=runif(%s);" % srcKey
        execExpr += "%s=%s[a.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
        if not DO_MULTINOMIAL:
            execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, numCols, trainDstKey, numCols, outputClass)
            execExpr += "factor(%s[, %s]);" % (trainDstKey, numCols)

    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=STUPID_REPEAT * 15)

    inspect = h2o_cmd.runInspect(key=trainDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey))

    print "test: same, but use the same runif() random result, complement"
    execExpr = "a.hex=runif(%s);" % srcKey
    execExpr += "%s=%s[a.hex%s,];" % (testDstKey, srcKey, '>0.9')
    if not DO_MULTINOMIAL:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, numCols, testDstKey, numCols, outputClass)
        execExpr += "factor(%s[, %s])" % (testDstKey, numCols)

    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=10)

    inspect = h2o_cmd.runInspect(key=testDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey))
def test_one_hot_expand_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 1100, 'cA', 5),
        (100, 1000, 'cB', 5),
        (100, 900, 'cC', 5),
        (100, 800, 'cD', 5),
        (100, 700, 'cE', 5),
        (100, 600, 'cF', 5),
        (100, 500, 'cG', 5),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    cnum = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # does it modify the original or ?
        oneHotResult = h2o.nodes[0].one_hot(source=parseResult['destination_key'])

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
def simpleCheckGBMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! #type = inspectGG['type'] #if 'unparsed' in type: # print "Warning: GBM Grid result destination_key is unparsed, can't interpret. Ignoring for now" # print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG['response'] # dict ### rows = inspectGG['rows'] #value_size_bytes = inspectGG['value_size_bytes'] model0 = glmGridResult['models'][0] alpha = model0['alpha'] area_under_curve = model0['area_under_curve'] error_0 = model0['error_0'] error_1 = model0['error_1'] model_key = model0['key'] print "best GBM model key:", model_key glm_lambda = model0['lambda'] # now indirect to the GBM result/model that's first in the list (best) inspectGBM = h2o_cmd.runInspect(None, model_key) h2o.verboseprint("GBMGrid inspectGBM:", h2o.dump_json(inspectGBM)) simpleCheckGBM(self, inspectGBM, colX, allowFailWarning=allowFailWarning, **kwargs)
def test_exec2_na_chop(self):
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'airlines/year2013.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    inspect = h2o_cmd.runInspect(key='i.hex')
    print "\ni.hex" \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numRows1 = inspect['numRows']
    numCols = inspect['numCols']

    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

    start = time.time()
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, keyX='s.hex',
        maxTrials=200, timeoutSecs=30, maxCol=numCols - 1)

    inspect = h2o_cmd.runInspect(key='s.hex')
    print "\ns.hex" \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numRows2 = inspect['numRows']

    print numRows1, numRows2

    h2o.check_sandbox_for_errors()
    print "exec end on ", "operators", 'took', time.time() - start, 'seconds'
def test_nfold_frame_extract(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=30) print "Just nfold_frame_extract away and see if anything blows up" splitMe = hex_key inspect = h2o_cmd.runInspect(key=splitMe) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] for s in range(20): inspect = h2o_cmd.runInspect(key=splitMe) numRows = inspect['numRows'] numCols = inspect['numCols'] # FIX! should check if afold is outside of nfold range allowance fs = h2o.nodes[0].nfold_frame_extract(source=splitMe, nfolds=2, afold=random.randint(0,1)) print "fs", h2o.dump_json(fs) split0_key = fs['split_keys'][0] split1_key = fs['split_keys'][1] split0_rows = fs['split_rows'][0] split1_rows = fs['split_rows'][1] print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split0_key if split0_rows<=2: break print "Iteration", s
def bigCheckResults(self, kmeans, csvPathname, parseKey, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) model_key = kmeans['destination_key'] kmeansResult = h2o_cmd.runInspect(key=model_key) centers = kmeansResult['KMeansModel']['clusters'] kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseKey['destination_key'], model_key=model_key, destination_key=applyDestinationKey) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) kmeansScoreResult = h2o.nodes[0].kmeans_score( key=parseKey['destination_key'], model_key=model_key) score = kmeansScoreResult['score'] rows_per_cluster = score['rows_per_cluster'] sqr_error_per_cluster = score['sqr_error_per_cluster'] tupleResultList = [] for i,c in enumerate(centers): print "\ncenters["+str(i)+"]: ", centers[i] print "rows_per_cluster["+str(i)+"]: ", rows_per_cluster[i] print "sqr_error_per_cluster["+str(i)+"]: ", sqr_error_per_cluster[i] tupleResultList.append( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) ) return (centers, tupleResultList)
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) # FIX! currently this is all unparsed! #type = inspectGG['type'] #if 'unparsed' in type: # print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now" # print "Run with -b arg to look at the browser output, for minimal checking of result" ### cols = inspectGG['cols'] response = inspectGG['response'] # dict ### rows = inspectGG['rows'] #value_size_bytes = inspectGG['value_size_bytes'] # FIX! does error_0/1 only exist for binomial? for m, model in enumerate(glmGridResult['models']): alpha = model['alpha'] area_under_curve = model['area_under_curve'] # FIX! should check max error? error_0 = model['error_0'] error_1 = model['error_1'] model_key = model['key'] print "#%s GLM model key: %s" % (m, model_key) glm_lambda = model['lambda'] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key']) h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM)) g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs) return g
def test_parse_fs_schmoo_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" # rowData = "1,0,65,1,2,1,1.4,0,6" rowData = "1,0,65,1,2,1,1,0,6" totalRows = 99860 write_syn_dataset(csvPathname, totalRows, headerData, rowData) print "This is the same format/data file used by test_same_parse, but the non-gzed version" print "\nSchmoo the # of rows" print "Updating the key and hex_key names for each trial" for trial in range (200): append_syn_dataset(csvPathname, rowData) totalRows += 1 start = time.time() key = csvFilename + "_" + str(trial) hex_key = csvFilename + "_" + str(trial) + ".hex" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "trial #", trial, "totalRows:", totalRows, "parse end on ", \ csvFilename, 'took', time.time() - start, 'seconds' h2o_cmd.runInspect(key=hex_key) # only used this for debug to look at parse (red last row) on failure ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs): # "grid": { # "destination_keys": [ # "GLMGridResults__8222a49156af52532a34fb3ce4304308_0", # "GLMGridResults__8222a49156af52532a34fb3ce4304308_1", # "GLMGridResults__8222a49156af52532a34fb3ce4304308_2" # ] # }, if h2o.beta_features: destination_key = glmGridResult['grid']['destination_keys'][0] inspectGG = h2o.nodes[0].glm_view(destination_key) models = inspectGG['glm_model']['submodels'] h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0])) g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs) else: destination_key = glmGridResult['destination_key'] inspectGG = h2o_cmd.runInspect(None, destination_key) h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG)) models = glmGridResult['models'] for m, model in enumerate(models): alpha = model['alpha'] area_under_curve = model['area_under_curve'] # FIX! should check max error? error_0 = model['error_0'] error_1 = model['error_1'] model_key = model['key'] print "#%s GLM model key: %s" % (m, model_key) glm_lambda = model['lambda'] # now indirect to the GLM result/model that's first in the list (best) inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key']) h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM)) g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs) return g
def test_exec2_sum(self): bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' if getpass.getuser()=='jenkins': csvPathname = 'standard/billion_rows.csv.gz' else: csvPathname = '1B/reals_100000x1000_15f.data' csvPathname = '1B/reals_1B_15f.data' csvPathname = '1B/reals_1000000x1000_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) for execExpr in exprList: start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print 'exec took', time.time() - start, 'seconds' print "result:", result h2o.check_sandbox_for_errors()
def test_sort_of_prostate_with_row_schmoo(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" rowData = "1,0,65,1,2,1,1.4,0,6" write_syn_dataset(csvPathname, 99860, headerData, rowData) print "This is the same format/data file used by test_same_parse, but the non-gzed version" print "\nSchmoo the # of rows" print "Updating the key and key2 names for each trial" for trial in range (200): append_syn_dataset(csvPathname, rowData) ### start = time.time() # this was useful to cause failures early on. Not needed eventually ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv")) ### print "Trial #", trial, "parse end on ", "prostate.csv" , 'took', time.time() - start, 'seconds' start = time.time() key = csvFilename + "_" + str(trial) key2 = csvFilename + "_" + str(trial) + ".hex" key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2) print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds' h2o_cmd.runInspect(key=key2) # only used this for debug to look at parse (red last row) on failure ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def test_parse_summary_c21(self): importFolderPath = '/mnt/0xcustomer-datasets/c21' timeoutSecs = 300 csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip' hex_key = 'train.hex' parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train) # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList) numCols = inspect['numCols'] numRows = inspect['numRows'] rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols) h2o_cmd.infoFromSummary(rSummary) csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip' validation_key = 'test.hex' parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test) # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList) numCols = inspect['numCols'] numRows = inspect['numRows'] rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols) h2o_cmd.infoFromSummary(rSummary)
def test_NOPASS_create_frame_fail(self): h2o.beta_features = True for trial in range(20): kwargs = {'integer_range': None, 'missing_fraction': 0.1, 'cols': 10, 'response_factors': 1, 'seed': 1234, 'randomize': 1, 'categorical_fraction': 0, 'rows': 1, 'factors': 0, 'real_range': 0, 'value': None, 'integer_fraction': 0} print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10) h2o_cmd.infoFromSummary(rSummary) print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_GLM2_tnc3_10(self): h2o.beta_features = True csvFilename = 'tnc3_10.csv' print "\n" + csvFilename hex_key = "tnc3.hex" h2b.browseTheCloud() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put', hex_key=hex_key, timeoutSecs=10) print "Parse result['Key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(10) if (1==0): lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after num swap", colResultList if (1==1): start = time.time() kwargs = {'response': 13, 'n_folds': 6} # hmm. maybe we should update to use key as input # in case exec is used to change the parseResult # in any case, the destination_key in parseResult was what was updated # so if we Exec, it's correct. glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") #****************** if (1==0): colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after char swap", colResultList if (1==1): start = time.time() kwargs = {'response': 13, 'n_folds': 6} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") if not h2o.browse_disable: ### print "\n <ctrl-C> to quit sleeping here" ### time.sleep(1500) pass
def test_tnc3_ignore(self): csvFilename = 'tnc3_10.csv' csvPathname = h2o.find_file('smalldata/' + csvFilename) print "\n" + csvPathname key2 = "tnc3.hex" h2b.browseTheCloud() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10) print "Parse result['Key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(10) if (1==0): lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after num swap", colResultList if (1==1): start = time.time() kwargs = {'y': 13, 'num_cross_validation_folds': 6} # hmm. maybe we should update to use key as input # in case exec is used to change the parseKey # in any case, the destination_key in parseKey was what was updated # so if we Exec, it's correct. glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") #****************** if (1==0): colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after char swap", colResultList if (1==1): start = time.time() kwargs = {'y': 13, 'num_cross_validation_folds': 6} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") if not h2o.browse_disable: ### print "\n <ctrl-C> to quit sleeping here" ### time.sleep(1500) pass
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) if h2o.beta_features: # can't use inspect on a model key? now? model = kmeans["model"] model_key = model["_key"] centers = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] kmeansResult = kmeans else: model_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult)) model = kmeansResult["KMeansModel"] centers = model["clusters"] error = model["error"] if h2o.beta_features: # need to use Predict2? pass # no scoring on Kmeans2?..just reuse # cols/max_ncols params? predictKey = applyDestinationKey predictResult = h2o.nodes[0].generate_predictions( data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey ) summaryResult = h2o.nodes[0].summary_page(key=predictKey) hcnt = summaryResult["summaries"][0]["hcnt"] # histogram rows_per_cluster = hcnt # FIX! does the cluster order/naming match, compared to cluster variances sqr_error_per_cluster = cluster_variances else: kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey ) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) # this was failing summaryResult = h2o_cmd.runSummary(key=applyDestinationKey) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key) score = kmeansScoreResult["score"] rows_per_cluster = score["rows_per_cluster"] sqr_error_per_cluster = score["sqr_error_per_cluster"] tupleResultList = [] print "\nerror: ", error for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i] tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) if h2o.beta_features: model_key = kmeans["model"]["_selfKey"] # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame # can't use inspect on a model key? now? kmeansResult = kmeans model = kmeansResult["model"] centers = model["clusters"] error = model["error"] else: model_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) model = kmeansResult["KMeansModel"] centers = model["clusters"] error = model["error"] if h2o.beta_features: # need to use Predict2? pass # no scoring on Kmeans2?..just reuse # cols/max_ncols params? predictKey = applyDestinationKey predictResult = h2o.nodes[0].generate_predictions( data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey ) summaryResult = h2o.nodes[0].summary_page(key=predictKey) hcnt = summaryResult["summaries"][0]["hcnt"] # histogram rows_per_cluster = hcnt # have to figure out how to get this with fvec sqr_error_per_cluster = [0 for h in hcnt] else: kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey ) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) # this was failing summaryResult = h2o_cmd.runSummary(key=applyDestinationKey) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key) score = kmeansScoreResult["score"] rows_per_cluster = score["rows_per_cluster"] sqr_error_per_cluster = score["sqr_error_per_cluster"] tupleResultList = [] print "\nerror: ", error for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", centers[i] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i] tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def test_rf_float_bigexp_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 1000 colCount = 7 write_syn_dataset(csvPathname, totalRows, colCount, headerData) for trial in range (5): # grow the data set rowData = rand_rowData(colCount) num = random.randint(4096, 10096) append_syn_dataset(csvPathname, colCount, num) totalRows += num # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = csvFilename + "_" + str(trial) + ".hex" # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ?? ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # cm0 = rfView['drf_model']['cms'][0]['_arr'] # print cm0 # self.assertEqual(len(cm0), numCols, # msg="%s cols in cm, means rf must have ignored some cols. I created data with %s cols" % (len(cm0), numCols-1)) inspect = h2o_cmd.runInspect(key=hex_key) cols = inspect['cols'] numCols = inspect['numCols'] for i,c in enumerate(cols): if i < (numCols-1): # everything except the last col (output) should be 8 byte float colType = c['type'] self.assertEqual(colType, 'Real', msg="col %d should be type Real: %s" % (i, colType)) ### h2o_cmd.runInspect(key=hex_key) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def test_exec2_xorsum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 1, 'r1', 0, 10, None), ] ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUll, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: start = time.time() (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) print 'exec took', time.time() - start, 'seconds' print "execResult:", h2o.dump_json(execResult) print "" print "%30s" % "fpResult:", "%.15f" % fpResult ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll # print "%30s" % "hex(bitResult):", hex(ullResult) ullResultList.append((ullResult, fpResult)) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll) print "%30s" % "expectedUll (0.16x):", "0x%0.16x %s" % (expectedUll, expectedUllAsDouble) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex'} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_exec2_row_range(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [(1000000, 5, "cA", 200)] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False ) print "Parse:", parseResult["destination_key"], "took", time.time() - start, "seconds" inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format( inspect["numCols"] ) # should match # of cols in header or ?? self.assertEqual( inspect["numCols"], colCount, "parse created result with the wrong number of cols %s %s" % (inspect["numCols"], colCount), ) self.assertEqual( inspect["numRows"], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % (inspect["numRows"], rowCount), ) REPEAT = 1 for i in range(REPEAT): hex_key_i = hex_key + "_" + str(i) execExpr = "%s=%s[1,]" % (hex_key_i, hex_key) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, 100) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, rowCount - 10) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(None, hex_key_i, timeoutSecs=timeoutSecs) h2o_cmd.infoFromInspect(inspect, hex_key_i) print "\n" + hex_key_i, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"])
def test_rf_hhp_2a_fvec(self): h2o.beta_features = True csvFilenameList = { 'hhp.cut3.214.data.gz', } for csvFilename in csvFilenameList: csvPathname = csvFilename print "RF start on ", csvPathname dataKeyTrain = 'rTrain.hex' start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=dataKeyTrain, schema='put', timeoutSecs=120) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numCols = inspect['numCols'] # we want the last col. Should be values 0 to 14. 14 most rare # from the cut3 set # 84777 0 # 13392 1 # 6546 2 # 5716 3 # 4210 4 # 3168 5 # 2009 6 # 1744 7 # 1287 8 # 1150 9 # 1133 10 # 780 11 # 806 12 # 700 13 # 345 14 # 3488 15 execExpr = "%s[,%s] = %s[,%s]==14" % (dataKeyTrain, numCols, dataKeyTrain, numCols) h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=dataKeyTrain) h2o_cmd.infoFromInspect(inspect, "going into RF") execResult = {'destination_key': dataKeyTrain} kwargs = { 'ntrees': 20, 'max_depth': 20, 'nbins': 50, } rfView = h2o_cmd.runRF(parseResult=execResult, timeoutSecs=900, retryDelaySecs=10, **kwargs) print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds' (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
def test_create_frame_rand1(self):
    h2o.beta_features = True
    # default
    params = {
        'rows': 1,
        'cols': 1
    }
    for trial in range(20):
        h2o_util.pickRandParams(paramDict, params)
        i = params.get('integer_fraction', None)
        c = params.get('categorical_fraction', None)
        r = params.get('randomize', None)
        v = params.get('value', None)

        # h2o does some strict checking on the combinations of these things
        # fractions have to add up to <= 1 and only be used if randomize
        # h2o default randomize=1?
        if r:
            if not i:
                i = 0
            if not c:
                c = 0
            if (i and c) and (i + c) >= 1.0:
                c = 1.0 - i
            params['integer_fraction'] = i
            params['categorical_fraction'] = c
            params['value'] = None
        else:
            params['randomize'] = 0
            params['integer_fraction'] = 0
            params['categorical_fraction'] = 0

        kwargs = params.copy()
        print kwargs

        timeoutSecs = 300
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex',
            schema='put', timeoutSecs=timeoutSecs)
        cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

        if DO_DOWNLOAD:
            csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
            h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

        if DO_INSPECT:
            h2o_cmd.runInspect(key='temp1000.hex')

        h2o_cmd.runSummary(key='temp1000.hex')
        print h2o.dump_json(cfResult)

        print "Trial #", trial, "completed"
def test_frame_split_balance(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just split away and see if anything blows up" splitMe = hex_key inspect = h2o_cmd.runInspect(key=splitMe) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] for s in range(20): inspect = h2o_cmd.runInspect(key=splitMe) numRows = inspect['numRows'] numCols = inspect['numCols'] fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5) split0_key = fs['split_keys'][0] split1_key = fs['split_keys'][1] split0_rows = fs['split_rows'][0] split1_rows = fs['split_rows'][1] split0_ratio = fs['split_ratios'][0] split1_ratio = fs['split_ratios'][1] print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split1_key # split should be within 1 row accuracy. let's say within 20 for now self.assertLess(abs(split1_rows - split0_rows), 2) self.assertEqual(numRows, (split1_rows + split0_rows)) self.assertEqual(numCols, origNumCols) if split0_rows <= 2: break print "Now do some rebalancing on the split frames" for trial in range(2): rb_key = "rb_%s_%s" % (trial, splitMe) SEEDPERFILE = random.randint(0, sys.maxint) randChunks = random.randint(1, 100) start = time.time() print "Trial %s: Rebalancing %s to %s with %s chunks" % (trial, splitMe, rb_key, randChunks) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\ h2o_cmd.runSummary(key=rb_key) print "\nInspecting the original parsed result" inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect=inspect) print "\nInspecting the rebalanced result with %s forced chunks" % randChunks inspect = h2o_cmd.runInspect(key=rb_key) h2o_cmd.infoFromInspect(inspect=inspect)
def test_tnc3_ignore(self): csvFilename = "tnc3.csv" csvPathname = h2o.find_file("smalldata/" + csvFilename) print "\n" + csvPathname key2 = "tnc3.hex" h2b.browseTheCloud() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1) print "Parse result['Key']:", parseKey["destination_key"] inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(10) if 1 == 0: lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols( lenNodes, numExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10 ) print "\ncolResultList after num swap", colResultList if 1 == 1: print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser" print 'The good case with ignore="boat,body"' rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # ****************** if 1 == 0: colResultList = h2e.exec_expr_list_across_cols( lenNodes, charExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10 ) print "\ncolResultList after char swap", colResultList if 1 == 1: print "\nNow the bad case (no ignore)" rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") if not h2o.browse_disable: ### print "\n <ctrl-C> to quit sleeping here" ### time.sleep(1500) pass
def test_parse_bounds_libsvm(self):
    print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
    ## h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 100, 'cA', 300),
        (100000, 100, 'cB', 300),
        (100, 100000, 'cC', 300),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        # dict of col sums for comparison to exec col sums below
        (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        row_size = inspect['row_size']
        value_size_bytes = inspect['value_size_bytes']
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(num_rows), \
            " num_cols:", "{:,}".format(num_cols), \
            " value_size_bytes:", "{:,}".format(value_size_bytes), \
            " row_size:", "{:,}".format(row_size)

        expectedRowSize = num_cols * 1 # plus output
        expectedValueSize = expectedRowSize * num_rows
        self.assertEqual(row_size, expectedRowSize,
            msg='row_size %s is not expected num_cols * 1 byte: %s' % (row_size, expectedRowSize))
        self.assertEqual(value_size_bytes, expectedValueSize,
            msg='value_size_bytes %s is not expected row_size * rows: %s' % (value_size_bytes, expectedValueSize))

        summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(colNumberMax + 1, num_cols,
            msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, num_cols))
        self.assertEqual(rowCount, num_rows,
            msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

        summary = summaryResult['summary']
        columnsList = summary['columns']
        self.assertEqual(colNumberMax + 1, len(columnsList),
            msg="generated %s cols (including output). summary has %s columns" % (colNumberMax + 1, len(columnsList)))

        for columns in columnsList:
            N = columns['N']
            # self.assertEqual(N, rowCount)
            name = columns['name']
            stype = columns['type']
            histogram = columns['histogram']
            bin_size = histogram['bin_size']
            bin_names = histogram['bin_names']
            bins = histogram['bins']
            nbins = histogram['bins']

            # definitely not enums
            zeros = columns['zeros']
            na = columns['na']
            smax = columns['max']
            smin = columns['min']
            mean = columns['mean']
            sigma = columns['sigma']

            # a single 1 in the last col
            if name == "V" + str(colNumberMax): # h2o puts a "V" prefix
                synZeros = num_rows - 1
                synSigma = None # not sure..depends on the # rows somehow (0 count vs 1 count)
                synMean = 1.0 / num_rows
                # why does this need to be a 1 entry list
                synMin = [0.0, 1.0]
                synMax = [1.0, 0.0]
            elif name == "V1":
                # can reverse-engineer the # of zeroes, since data is always 1
                synSum = synColSumDict[1] # could get the same sum for all cols
                synZeros = num_rows - synSum
                synSigma = 0.50
                synMean = (synSum + 0.0) / num_rows
                synMin = [0.0, 1.0]
                synMax = [1.0, 0.0]
            else:
                synZeros = num_rows
                synSigma = 0.0
                synMean = 0.0
                synMin = [0.0]
                synMax = [0.0]

            # print zeros, synZeros
            self.assertAlmostEqual(float(mean), synMean, places=6,
                msg='col %s mean %s is not equal to generated mean %s' % (name, mean, 0))

            # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
            self.assertEqual(smin, synMin,
                msg='col %s min %s is not equal to generated min %s' % (name, smin, synMin))

            # reverse engineered the number of zeroes, knowing data was always 1 if present?
            if name == "V65536" or name == "V65537":
                print "columns around possible zeros mismatch:", h2o.dump_json(columns)

            self.assertEqual(zeros, synZeros,
                msg='col %s zeros %s is not equal to generated zeros count %s' % (name, zeros, synZeros))

            self.assertEqual(stype, 'number',
                msg='col %s type %s is not equal to %s' % (name, stype, 'number'))

            # our random generation will have some variance for col 1. so just check to 2 places
            if synSigma:
                self.assertAlmostEqual(float(sigma), synSigma, delta=0.03,
                    msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma))

            if CHECK_MAX:
                self.assertEqual(smax, synMax,
                    msg='col %s max %s is not equal to generated max %s' % (name, smax, synMax))

            self.assertEqual(0, na,
                msg='col %s num_missing_values %d should be 0' % (name, na))
def test_exec2_enums_rand_cut(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = ROWS
    tryList = [
        (n, 10, 9, 'cE', 300),
    ]

    # create key names to use for exec
    eKeys = ['e%s' % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        hex_key = 'p'
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        print "Creating", CUT_EXPR_CNT, 'cut expressions'
        for j in range(CUT_EXPR_CNT):
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression
            cols = random.sample(range(iColCount), random.randint(1, iColCount))
            for c in cols:
                # possible choices within the column
                cel = colEnumList[c]
                # for now the cutValues are numbers for the enum mappings
                # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                # celChoice = str(random.choice(range(len(cel))))
                celChoice = random.choice(range(len(cel)))
                cutValue[c] = celChoice

            cutExprList = []
            pKey = Key('p')
            for i, c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    # cutExprList.append('p$C'+str(i+1)+'=='+c)
                    # all column indexing in h2o-dev is with number
                    e = Fcn('==', c, pKey[:, i])
                    cutExprList.append(e)

            cutExpr = None
            for ce in cutExprList:
                if cutExpr:
                    cutExpr = Fcn('&', cutExpr, ce)
                else:
                    cutExpr = ce

            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            hKey = Key(hex_key)
            rowExpr = hKey[cutExpr, :]

            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        # print h2o.dump_json(inspect)

        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

        # error if any col has constant values
        # if len(constantValuesDict) != 0:
        #     raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!

        # build up the columns
        Assign('b', [1, 2, 3])
        # could also append 1 col at a time, by assigning to the next col number?
        Assign('a', Cbind(['b' for i in range(colCount)]))

        for eKey in eKeys:
            Assign(eKey, 'a')
            ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            randICol = random.randint(0, iColCount - 1)
            randOCol = random.randint(iColCount, iColCount + oColCount - 1)

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            if 1 == 1:
                start = time.time()
                Assign(fKey, random.choice(rowExprList)).do()
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            if numRows == 0 or numCols != colCount:
                h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

            # FIX! put quantile back in?
            quantileTime = 0

            # remove all keys*******************************************************
            # what about hex_key?
            if 1 == 0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = 'trial'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_NN2_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 90 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 20.0, 'destination_key': model_key, 'validation': validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.04 relTol = 0.01 predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { 'cols': None, 'initialization': 'Furthest', 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." if __name__ == '__main__': h2o.unit_main()
def test_many_fp_formats_libsvm(self): h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 10, 'cA', 30, 'sparse50'), (100, 10, 'cB', 30, 'sparse'), (100000, 100, 'cC', 30, 'sparse'), (1000, 10, 'cD', 30, 'sparse50'), (100, 100, 'cE', 30, 'sparse'), (100, 100, 'cF', 30, 'sparse50'), ] # h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList: # for sel in range(48): # len(caseList) for sel in [random.randint(0, 47)]: # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution) selKey2 = key2 + "_" + str(sel) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] print "\n" + csvFilename # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseKey['destination_key'], timeoutSecs=300) if DO_SUMMARY: summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # we might have added some zeros at the end, that our colNumberMax won't include print synColSumDict.keys(), colNumberMax self.assertEqual( colNumberMax + 1, num_cols, msg= "generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, num_cols)) # Exec (column sums)************************************************* h2e.exec_zero_list(zeroList) # how do we know the max dimension (synthetic may not generate anything for the last col) # use num_cols?. num_cols should be <= colCount. 
colSumList = h2e.exec_expr_list_across_cols( None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs) self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows)) # need to fix this for compare to expected # we should be able to keep the list of fp sums per col above # when we generate the dataset print "\ncolSumList:", colSumList print "\nsynColSumDict:", synColSumDict for k, v in synColSumDict.iteritems(): if k > colNumberMax: # ignore any extra 0 cols at the end continue # k should be integers that match the number of cols self.assertTrue( k >= 0 and k < len(colSumList), msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols)) syn = {} if k == 0: syn['name'] = "Target" syn['size'] = { 1, 2 } # can be two if we actually used the full range 0-255 (need extra for h2o NA) syn['type'] = {'int'} syn['min'] = classMin syn['max'] = classMax # don't check these for the col 0 'Target' syn['scale'] = {1} # syn['base'] = 0 # syn['variance'] = 0 elif k == 1: # we forced this to always be 0 syn['name'] = "V" + str(k) syn['size'] = {1} syn['type'] = {'int'} syn['min'] = 0 syn['max'] = 0 syn['scale'] = {1} syn['base'] = 0 syn['variance'] = 0 else: syn['name'] = "V" + str(k) syn['size'] = { 1, 2, 4, 8 } # can be 2, 4 or 8? maybe make this a set for membership check syn['type'] = {'int', 'float'} syn['min'] = valMin syn['max'] = valMax syn['scale'] = {1, 10, 100, 1000} # syn['base'] = 0 # syn['variance'] = 0 syn['num_missing_values'] = 0 syn['enum_domain_size'] = 0 # syn['min'] = 0 # syn['max'] = 0 # syn['mean'] = 0 cols = inspect['cols'][k] for synKey in syn: # we may not see the min/max range of values that was bounded by our gen, but # we can check that it's a subset of the allowed range if synKey == 'min': self.assertTrue( syn[synKey] <= cols[synKey], msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey])) elif synKey == 'max': self.assertTrue( syn[synKey] >= cols[synKey], msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey])) elif synKey == 'size' or synKey == 'scale' or synKey == 'type': if cols[synKey] not in syn[synKey]: # for debug of why it was a bad size print "cols size/min/max:", cols['size'], cols[ 'min'], cols['max'] print "syn size/min/max:", syn['size'], syn[ 'min'], syn['max'] raise Exception( 'col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey])) else: self.assertEqual( syn[synKey], cols[synKey], msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey])) colSum = colSumList[k] print "\nComparing col", k, "sums:", v, colSum # Even though we're comparing floating point sums, the operations probably should have # been done in same order, so maybe the comparison can be exact (or not!) self.assertAlmostEqual( float(v), colSum, places=0, msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
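Two comparison styles carry the checks above: set membership for per-column metadata (size/scale/type can legitimately vary) and a places=0 comparison for floating point column sums. A self-contained sketch with fabricated values:

# fabricated per-column metadata and allowed sets, mirroring the syn dict above
allowed = {'size': {1, 2, 4, 8}, 'type': {'int', 'float'}, 'scale': {1, 10, 100, 1000}}
col = {'size': 4, 'type': 'float', 'scale': 10}
for k in allowed:
    assert col[k] in allowed[k], "col %s %s should be in allowed %s" % (k, col[k], allowed[k])

# sums accumulated in a different order can differ in the low bits, so compare
# to zero decimal places, like assertAlmostEqual(..., places=0) above
expectedSum, execSum = 12345.601, 12345.597
assert round(expectedSum - execSum, 0) == 0.0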
def test_parse_multi_exclude_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u' ] tryList = [ (300, 100, 'cA', 60, '*x[2-5]*'), (310, 200, 'cB', 60, '*x[1,3-5]*'), (320, 300, 'cC', 60, '*x[1-2,4-5]*'), (330, 400, 'cD', 60, '*x[1-3-5]*'), (340, 500, 'cE', 60, '*x[1-4]*'), ] ## h2b.browseTheCloud() cnum = 0 # create them all first for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # FIX! should we add a header to them randomly??? print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) for fileN in range(FILENUM): csvFilename = 'syn_' + str(fileN) + "_" + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList) for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: print f h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put') # pattern match all, then use exclude parseResult = h2i.parse_only(pattern="*syn_*", hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # FIX! h2o strips one of the headers, but treats all the other files with headers as data numRows = inspect['numRows'] numCols = inspect['numCols'] print "\n" + parseResult['destination_key'] + ":", \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # all should have rowCount rows (due to the excludePattern self.assertEqual(numRows, rowCount*FILENUM, msg=("got numRows: %s. Should be rowCount: %s * FILENUM: %s" % \ (numRows, rowCount, FILENUM)))
def test_rf_covtype_train_oobe2(self): print "\nUse randomBitVector and filter to separate the dataset randomly" importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilename key2 = csvFilename + ".hex" h2i.setupImportFolder(None, importFolderPath) print "\nUsing header=0 on the normal covtype.data" parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=0, timeoutSecs=100) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # how many rows for each pct? num_rows = inspect['num_rows'] pct10 = int(num_rows * .1) rowsForPct = [i * pct10 for i in range(0, 11)] # this can be slightly less than 10% last10 = num_rows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] expectTrainPctRightList = [ 0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79 ] expectScorePctRightList = [ 0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78 ] print "Creating the key of the last 10% data, for scoring" dataKeyTest = "rTest" dataKeyTrain = "rTrain" # start at 90% rows + 1 # randomBitVector(size,selected) # randomFilter(srcFrame,rows,seed) # filter(srcFrame,bitVect) # odd. output is byte, all other exec outputs are 8 byte? (at least the ones below?) execExpr = "rbv=randomBitVector(" + str(num_rows) + "," + str( last10) + ",12345)" h2o_exec.exec_expr(None, execExpr, resultKey="rbv", timeoutSecs=10) # complement the bit vector execExpr = "not_rbv=colSwap(rbv,0,rbv[0]==0?1:0)" h2o_exec.exec_expr(None, execExpr, resultKey="not_rbv", timeoutSecs=10) execExpr = dataKeyTest + "=filter(" + key2 + ",rbv)" h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10) execExpr = dataKeyTrain + "=filter(" + key2 + ",not_rbv)" h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10) ### time.sleep(3600) # keep the 0 entry empty actualTrainPctRightList = [0] actualScorePctRightList = [0] for trial in range(1, 10): # always slice from the beginning rowsToUse = rowsForPct[trial % 10] resultKey = "r" + str(trial) execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str( rowsToUse) + ")" # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)" h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10) parseKey['destination_key'] = resultKey # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() # do oobe kwargs['out_of_bag_error_estimate'] = 1 kwargs['model_key'] = "model_" + str(trial) rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) oobeTrainPctRight = 100 * ( 1.0 - rfv['confusion_matrix']['classification_error']) self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial], msg="OOBE: pct. right for %s pct. 
training not close enough %6.2f %6.2f"% \ ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2) actualTrainPctRightList.append(oobeTrainPctRight) print "Now score on the last 10%" # pop the stuff from kwargs that were passing as params model_key = rfv['model_key'] kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& # http://192.168.1.28:54321/GeneratePredictionsPage.html?model_key=__RFModel_0e2531bc-2552-4f65-8a4a-843031b0cb99&key=iris # http://192.168.1.28:54321/RFView.html?data_key=iris.hex&model_key=__RFModel_0e2531bc-2552-4f65-8a4a-843031b0cb99&ntree=50&response_variable=4&class_weights=Iris-setosa%3D1.0%2CIris-versicolor%3D1.0%2CIris-virginica%3D1.0&out_of_bag_error_estimate=true&iterative_cm=true kwargs['iterative_cm'] = 1 # do full scoring kwargs['out_of_bag_error_estimate'] = 0 rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100 * ( 1.0 - rfv['confusion_matrix']['classification_error']) self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial], msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \ ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2) actualScorePctRightList.append(fullScorePctRight) print "Trial #", trial, "completed", "using %6.2f" % ( rowsToUse * 100.0 / num_rows), "pct. of all rows" actualDelta = [ abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList) ] niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList] print "maybe should update with actual. Remove single quotes" print "actualTrainPctRightList =", niceFp niceFp = ["{0:0.2f}".format(i) for i in actualDelta] print "actualDelta =", niceFp actualDelta = [ abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList) ] niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList] print "maybe should update with actual. Remove single quotes" print "actualScorePctRightList =", niceFp niceFp = ["{0:0.2f}".format(i) for i in actualDelta] print "actualDelta =", niceFp
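The tail of the test above turns the expected-vs-actual percent-right lists into printable deltas so new expected values can be pasted back in without the quotes. A standalone restatement with made-up numbers:

expect = [0, 85.16, 88.45, 90.24]
actual = [0, 85.10, 88.61, 90.30]
actualDelta = [abs(a - b) for a, b in zip(expect, actual)]
# two-decimal strings, so the list can be pasted back in as the new expected values
print "actualTrainPctRightList =", ["{0:0.2f}".format(i) for i in actual]
print "actualDelta =", ["{0:0.2f}".format(i) for i in actualDelta]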
def test_RF(self): h2o.beta_features = True if h2o.beta_features: paramsTrainRF = { 'ntrees': 3, 'max_depth': 10, 'nbins': 50, 'timeoutSecs': 600, 'response': 'C54', 'classification': 1, } paramsScoreRF = { 'vactual': 'C54', 'timeoutSecs': 600, } else: paramsTrainRF = { 'use_non_local_data': 1, 'ntree': 10, 'depth': 300, 'bin_limit': 20000, 'stat_type': 'ENTROPY', 'out_of_bag_error_estimate': 1, 'exclusive_split_limit': 0, 'timeoutSecs': 60, } paramsScoreRF = { # scoring requires the response_variable. it defaults to last, so normally # we don't need to specify. But put this here and (above if used) # in case a dataset doesn't use last col 'response_variable': None, 'timeoutSecs': 60, 'out_of_bag_error_estimate': 0, } # train1 trainKey1 = self.loadData(trainDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsScoreRF.copy() h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True) scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) print "\nTrain1\n==========" h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs) print "\nScore1\n=========+" print h2o.dump_json(scoreResult1) h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs) # train2 trainKey2 = self.loadData(trainDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsScoreRF.copy() h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True) scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) print "\nTrain2\n==========" h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs) print "\nScore2\n==========" h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs) if 1 == 0: print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def test_xl_basic(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexDF = 'v' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexDF) # uses h2o_xl to do magic with Rapids # does this DFInit to rows=0 now? a = DF('a1') # knon_* key assert isinstance(a, DF) assert isinstance(a, Key) assert isinstance(a, Xbase) assert not isinstance(a, KeyIndexed) assert not isinstance(a, Fcn) assert not isinstance(a, Assign) # look at our secret stash in the base class. Should see the DFInit? print "Does the lastExecResult stash work?", dump_json( h2o_xl.Xbase.lastExecResult) # this should work if str(DF) returns DF.frame inspect = h2o_cmd.runInspect(key=a) # print "inspect a", dump_json(inspect) b = DF('b1') assert isinstance(b, DF) inspect = h2o_cmd.runInspect(key=b) # print "inspect b", dump_json(inspect) Assign(a, [0, 0, 0]) assert isinstance(a, Key) b <<= [0, 0, 0] assert isinstance(b, Key) # FIX! how come I have to create c here first for python # see here # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python # is it too much to require c to exist first? # c = DF() # c <<= a + b # this will trigger ok? c = DF('c1') c <<= [0, 0, 0] assert isinstance(c, Key) # c[0] <<= a + b # Assign(lhs=c[0], rhs=(a + b)) rhs = a + b Assign(c, rhs) ast = h2o_xl.Xbase.lastExecResult['ast'] astExpected = "(= !c1 (+ %a1 %b1))" assert ast == astExpected, "Actual: %s Expected: %s" % (ast, astExpected) rhs = a[0] + b[0] Assign(c[0], rhs) ast = h2o_xl.Xbase.lastExecResult['ast'] astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))" assert ast == astExpected, "Actual: %s Expected: %s" % (ast, astExpected) Assign(c[1], (a[2] + b[2])) ast = h2o_xl.Xbase.lastExecResult['ast'] astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))" assert ast == astExpected, "Actual: %s Expected: %s" % (ast, astExpected) # assert ast = "(= !b1 (is.na (c {#0})))" assert isinstance(c, Key), type(c) inspect = h2o_cmd.runInspect(key=c) # # print "inspect c", dump_json(inspect) # DF inits the frame # if you just want an existing Key, say existing=True a = DF('a2') # named data frame assert isinstance(a, DF) b = DF('b2') c = DF('c2') inspect = h2o_cmd.runInspect(key=c) # # print "inspect c", dump_json(inspect) a <<= 3 b <<= 3 c <<= 3 c[0] <<= a[0] + b[0] assert isinstance(c, Key) inspect = h2o_cmd.runInspect(key=c) # print "inspect c", dump_json(inspect) a = DF('a3') # named data frame b = DF('b3') c = DF('c3') a <<= 4 b <<= 4 c <<= 4 c[0] <<= a[0] - b[0] assert isinstance(c, Key) c[0] <<= a[0] * b[0] assert isinstance(c, Key) a = DF('a4') # named data frame b = DF('b4') c = DF('c4') a <<= 5 b <<= 5 c <<= 5 c[0] <<= (a[0] - b[0]) assert isinstance(c, Key) inspect = h2o_cmd.runInspect(key=c) # print "inspect c", dump_json(inspect) c[0] <<= (a[0] & b[0]) | a[0] assert isinstance(c, Key) inspect = h2o_cmd.runInspect(key=c) # print "inspect c", dump_json(inspect) # print "\nDoes the keyWriteHistoryList work?" for k in Xbase.keyWriteHistoryList: print k h2o.check_sandbox_for_errors()
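The a + b / c <<= ... syntax above works through operator overloading: '+' builds an expression object and '<<=' (__ilshift__) performs the assignment, which is how h2o_xl can record a Rapids AST like "(= !c1 (+ %a1 %b1))". A toy, h2o-free sketch of that DSL trick; the Expr and Frame classes here are made up for illustration:

class Expr(object):
    def __init__(self, ast):
        self.ast = ast
    def __add__(self, other):
        # '+' only builds a bigger expression; nothing is executed yet
        return Expr("(+ %s %s)" % (self.ast, other.ast))

class Frame(Expr):
    def __init__(self, name):
        Expr.__init__(self, "%" + name)
        self.name = name
    def __ilshift__(self, rhs):
        # in h2o_xl this is where the Rapids expression would be sent to h2o
        print "(= !%s %s)" % (self.name, rhs.ast)
        return self

a, b, c = Frame('a1'), Frame('b1'), Frame('c1')
c <<= a + b    # prints: (= !c1 (+ %a1 %b1))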
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex' ) ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False # turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
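pp_cm_summary above is used as a percent-wrong figure taken from a confusion matrix. A hedged sketch of that computation for a plain list-of-lists CM (ignoring the trailing NA row the real h2o CM may carry); the matrix values are fabricated:

def pct_wrong(cm):
    # everything off the diagonal is a misclassification
    total = sum(sum(row) for row in cm)
    right = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - right) / total

cm = [[50, 2],
      [3, 45]]
print pct_wrong(cm)    # 5.0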
def test_GLM_convergence_2(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 1, 'cD', 300), # (100, 100, 'cE', 300), # (100, 200, 'cF', 300), # (100, 300, 'cG', 300), # (100, 400, 'cH', 300), # (100, 500, 'cI', 300), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) USEKNOWNFAILURE = False for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) if USEKNOWNFAILURE: csvFilename = 'failtoconverge_100x50.csv' csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put') print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'max_iter': 40, 'lambda': 1e0, 'alpha': 0.5, 'link': 'familyDefault', 'n_folds': 0, 'beta_epsilon': 1e-4, 'thresholds': '0:1:0.01', } if USEKNOWNFAILURE: kwargs['y'] = 50 else: kwargs['y'] = y emsg = None for i in range(3): start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print 'glm #', i, 'end on', csvPathname, 'took', time.time( ) - start, 'seconds' # we can pass the warning, without stopping in the test, so we can # redo it in the browser for comparison (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, allowFailWarning=True, **kwargs) if 1 == 0: print "\n", "\ncoefficients in col order:" # since we're loading the x50 file all the time..the real colCount # should be 50 (0 to 49) if USEKNOWNFAILURE: showCols = 50 else: showCols = colCount for c in range(showCols): print "%s:\t%s" % (c, coefficients[c]) print "intercept:\t", intercept # gets the failed to converge, here, after we see it in the browser too x = re.compile("[Ff]ailed") if warnings: print "warnings:", warnings for w in warnings: print "w:", w if (re.search(x, w)): # first if emsg is None: emsg = w print w if emsg: break if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5) h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(5) # gets the failed to converge, here, after we see it in the browser too if emsg is not None: raise Exception(emsg)
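The warning handling above scans the GLM warnings for a failed-to-converge message with a compiled regex and defers the raise until after the browser inspection. A standalone restatement with a fabricated warnings list:

import re

x = re.compile("[Ff]ailed")
warnings = ["Failed to converge in 40 iterations", "something benign"]   # fabricated
emsg = None
for w in warnings:
    if re.search(x, w):
        emsg = w
        break
if emsg:
    print "would raise Exception(%r) after the browser inspection" % emsg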
def test_GLM_100Mx70_hosts(self): # enable this if you need to re-create the file if 1 == 0: SYNDATASETS_DIR = h2o.make_syn_dir() createList = [ (100000000, 70, 'cA', 10000), ] for (rowCount, colCount, hex_key, timeoutSecs) in createList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # Have to copy it to /home/0xdiag/datasets! if localhost: csvFilenameList = [ # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'), # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'), ('rand_logreg_100000000x70.csv', 500, 'rand_100Mx70.hex'), ] else: # None is okay for hex_key csvFilenameList = [ # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'), # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'), ('rand_logreg_100000000x70.csv', 500, 'rand_100Mx70.hex'), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for csvFilename, timeoutSecs, hex_key in csvFilenameList: csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000, retryDelaySecs=5, initialDelaySecs=10, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) csvPathname = importFolderPath + "/" + csvFilename num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + csvPathname, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols) y = num_cols - 1 kwargs = { 'family': 'binomial', 'link': 'logit', 'y': y, 'max_iter': 8, 'n_folds': 0, 'beta_epsilon': 1e-4, 'alpha': 0, 'lambda': 0 } for trial in range(3): start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_simple2(self): # h2o-dev doesn't take ../.. type paths? make find_file return absolute path # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data") csvPathname = find_file("smalldata/logreg/prostate.csv") import_result = h2o.n0.import_files(path=csvPathname) # print dump_json(import_result) k = import_result['keys'][0] frames_result = h2o.n0.frames(key=k) frame = frames_result['frames'][0] rows = frame['rows'] columns = frame['columns'] for c in columns: label = c['label'] missing = c['missing_count'] stype = c['type'] domain = c['domain'] # print dump_json(frame) # let's see what ray's util does frames = h2o.n0.frames()['frames'] frames_dict = h2o_util.list_to_dict(frames, 'key/name') # print "frames:", dump_json(frames) # print "frames_dict:", dump_json(frames_dict) for k, v in frames_dict.items(): print "frames_dict key:", k # interesting. we can do dictionary comprehensions # { k:v for k,v in my_dict.items() if 'Peter' in k } # how do you parse multiple files parse_result = h2o.n0.parse( key=k, intermediateResults=DO_INTERMEDIATE_RESULTS) frame = parse_result['frames'][0] hex_key = frame['key']['name'] colCount = 9 rowCount = 380 # colCount = 11 # rowCount = 1000000 start = time.time() inspect = h2o_cmd.runInspect(None, hex_key) print "Inspect:", hex_key, "took", time.time() - start, "seconds" numCols = len(inspect['frames'][0]['columns']) numRows = inspect['frames'][0]['rows'] print "\n" + csvPathname, \ " rows:", "{:,}".format(numRows), \ " len(columns):", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (numRows, rowCount)) verboseprint(hex_key, ":", dump_json(parse_result))
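The comment above points at dictionary comprehensions for filtering the frames dict. A small standalone example of both steps: build a dict keyed by frame name (roughly what h2o_util.list_to_dict appears to do for 'key/name') and then filter it; the frame dicts here are fabricated:

frames = [{'key': {'name': 'prostate.hex'}, 'rows': 380},
          {'key': {'name': 'iris.hex'}, 'rows': 150}]
frames_dict = dict((f['key']['name'], f) for f in frames)
# dict comprehension, as in the comment: keep only keys containing 'prostate'
prostate_only = {k: v for k, v in frames_dict.items() if 'prostate' in k}
print prostate_only.keys()    # ['prostate.hex']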
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if h2o.localhost: tryList = [ (10000, 100, 'cA', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, 'cD', 300), (10000, 200, 'cE', 300), (10000, 300, 'cF', 300), (10000, 400, 'cG', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) # hack elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 5 prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] # for max_depth in [5,10,20,40]: for max_depth in [5, 10, 20]: # PARSE a new header**************************************** print "Creating new header", hdrPathname prefix = prefixList.pop(0) write_syn_header(hdrPathname, rowCount, colCount, prefix) # upload and parse the header to a hex hdr_hex_key = prefix + "_hdr.hex" parseHdrResult = h2i.import_parse(bucket=None, path=hdrPathname, schema='put', header=1, # REQUIRED! otherwise will interpret as enums hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False) # Set Column Names (before autoframe is created) h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key) # GBM print "response col name is changing each iteration: parsing a new header" params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': prefix + "_response", 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # works if you delete the autoframe ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe') # just plot the last one if DO_PLOT: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_quantile_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00)), (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)), (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)), (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50, 1.00)), (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00, 100.0)), (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)), (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00)), (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7, 100,00)), (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00)), (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28, 100.00)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta h2o.beta_features = False SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) h2o.beta_features = False csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["num_rows"] numCols = inspect["num_cols"] h2o.beta_features = True summaryResult = h2o_cmd.runSummary(key=hex_key) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype= stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd) zeros = stats['zeros'] mins = stats['mins'] h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') maxs = stats['maxs'] h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] pctile = stats['pctile'] h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. 
expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows/len(hcnt) # apparently we're not able to estimate for these datasets # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] print "min/25/50/75/max colname:", colname, "(2 places):", compareActual print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 h2p.blue_print("\nTrying exec quantile") # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)" # do the equivalent exec quantile? # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds) print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile" h2o.beta_features = True for i, threshold in enumerate(thresholds): # FIX! do two of the same?..use same one for the 2nd if i!=0: # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec)) h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i])) if not result: raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold)) h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='exec percentile: %s too different from expected: %s' % (result, pctile[i])) # for now, do one with all, but no checking else: # This seemed to "work" but how do I get the key name for the list of values returned # the browser result field seemed right, but nulls in the key execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds))) (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key='r2') numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(numCols,1) self.assertEqual(numRows,len(thresholds)) # FIX! should run thru the values in the col? how to get # compare the last one if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=thresholds[-1], # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=result, ) h2o.nodes[0].remove_all_keys()
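The tolerance used throughout the test above is derived from the histogram bin width: with 20 bins over the expected range, allow half a bin of error plus 5% slop for floating point. A standalone restatement with example bounds:

expectedMin, expectedMax = -100, 100
nbins = 20
maxDelta = ((expectedMax - expectedMin) / (nbins + 0.0)) / 2.0    # half a bin
maxDelta = 1.05 * maxDelta                                        # plus 5% for fp error
print maxDelta    # 5.25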
def test_c10_rel_glm(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' csvFilename = 'classification1Train.txt' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # keepList = [] # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList) # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices # since we're no longer zero based, increment by 1 x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] x = ['C' + str(i + 1) for i in x_from_zero] # GLM Train*********************************************************** y = 'C1' # response column; matches 'y' in kwargs below keepPattern = None # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "from goodX (not used) x:", x print "y:", y # have to use named cols, and they start with 1 kwargs = { 'x': x, 'y': 'C1', # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, }
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): if 1==0: execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30) fpResult = execResult['scalar'] else: (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="h")) # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="r1")) print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
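h2o_util.doubleToUnsignedLongLong and unsignedLongLongToDouble above reinterpret the 8 bytes of an IEEE double as a 64-bit unsigned integer and back, which is what makes the xorsum bit comparison possible. A hedged sketch using struct (the real helpers may handle byte order differently):

import struct

def double_to_ull(d):
    # reinterpret the double's 8 bytes as an unsigned 64-bit integer
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def ull_to_double(u):
    return struct.unpack('<d', struct.pack('<Q', u))[0]

fp = 1.5
bits = double_to_ull(fp)
print "0x%0.16x" % bits      # 0x3ff8000000000000
print ull_to_double(bits)    # 1.5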
def test_exec2_log_like_R(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' csvPathname = 'airlines/year2013.csv' # csvPathname = '1B/reals_100000x1000_15f.data' # csvPathname = '1B/reals_1000000x1000_15f.data' # csvPathname = '1B/reals_1000000x1_15f.data' # csvPathname = '1B/reals_1B_15f.data' # csvPathname = '1B/reals_100M_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) xList = [] eList = [] fList = [] for execExpr in initList: execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) for trial in range(300): for execExpr in exprList: # put the trial number into the temp for uniqueness execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) execTime = time.time() - start print 'exec took', execTime, 'seconds' c = h2o.nodes[0].get_cloud() c = c['nodes'] # print (h2o.dump_json(c)) k = [i['num_keys'] for i in c] v = [i['value_size_bytes'] for i in c] print "keys: %s" % " ".join(map(str, k)) print "value_size_bytes: %s" % " ".join(map(str, v)) # print "result:", result if DO_ORIG: if 'r1' in execExpr: xList.append(trial) eList.append(execTime) if 'log' in execExpr: fList.append(execTime) else: xList.append(trial) eList.append(execTime) fList.append(execTime) h2o.check_sandbox_for_errors() # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' if DO_ORIG: eLabel = 'time: Last.value<trial>.4 = r1[,c(1)]' fLabel = 'time: Last.value<trial>.7 = log(Last.value<trial>.6)' else: eLabel = 'time: Last.value.3 = r2+1' fLabel = 'time: Last.value.3 = r2+1' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
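The re.sub above rewrites 'Last.value' with the trial number so every iteration gets a unique temporary key instead of colliding on the same one. A minimal standalone example:

import re

execExpr = "Last.value.4 = r1[,c(1)]"
trial = 7
print re.sub('Last.value', 'Last.value%s' % trial, execExpr)    # Last.value7.4 = r1[,c(1)]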
def test_rf_libsvm_fvec(self): h2o.beta_features = True # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("gisette_scale.svm", "cF", 30, 1, 0), ("covtype.binary.svm", "cC", 30, 1, 1), ("mnist_train.svm", "cM", 30, 1, 1), # FIX! fails KMeansScore # not integer output # ("colon-cancer.svm", "cA", 30, 1, 1), ("connect4.svm", "cB", 30, 1, 1), # ("syn_6_1000_10.svm", "cK", 30, 1, 0), Bad libsvm file has the same column multiple times. # float response requires regression ("syn_0_100_1000.svm", "cL", 30, 1, 0), ("mushrooms.svm", "cG", 30, 1, 1), # rf doesn't like reals # ("duke.svm", "cD", 30, 1, 1), # too many features? 150K inspect timeout? # ("E2006.train.svm", "cE", 30, 1, 1), # too big for rf (memory error) # ("news20.svm", "cH", 30, 1, 1), # multiclass format ..don't support # ("tmc2007_train.svm", "cJ", 30, 1, 1), # normal csv ] ### csvFilenameList = random.sample(csvFilenameAll,1) # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, hex_key, timeoutSecs, resultMult, classification) in csvFilenameList: # have to import each time, because h2o deletes source after parse bucket = "home-0xdiag-datasets" csvPathname = "libsvm/" + csvFilename # PARSE****************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult[ 'destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) # RF****************************************** kwargs = { 'ntrees': 1, 'response': 0, 'classification': classification, 'importance': 0, } timeoutSecs = 600 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
def test_summary2_int2B(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (100000, 1, 'B.hex', 2533255332, 2633256000, ('C1', None, None, None, None, None)), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0)) # add 5% for fp errors? maxDelta = 1.05 * maxDelta # also need to add some variance due to random distribution? # maybe a percentage of the mean distMean = (expectedMax - expectedMin) / 2 maxShift = distMean * .01 maxDelta = maxDelta + maxShift SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename numRows = inspect["numRows"] numCols = inspect["numCols"] summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # only one column column = summaryResult['summaries'][0] colname = column['colname'] if expected[0]: self.assertEqual(colname, expected[0]) coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # FIX! we should compare mean and sd to expected? mean = stats['mean'] sd = stats['sd'] print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals( mean) print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals( sd) zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] # the thresholds h2o used, should match what we expected expectedPct = [ 0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99 ] pctile = stats['pctile'] if expected[1]: h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected') h2o_util.assertApproxEqual( pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected') h2o_util.assertApproxEqual( pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected') h2o_util.assertApproxEqual( pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected') h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected') hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] print "pct:", pct print "hcnt:", hcnt print "len(hcnt)", len(hcnt) # don't check the last bin for b in hcnt[1:-1]: # should we be able to check for a uniform distribution in the files? e = numRows / len( hcnt ) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution) # apparently we can't estimate any more # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, # msg="Bins not right. 
b: %s e: %s" % (b, e)) pt = h2o_util.twoDecimals(pctile) mx = h2o_util.twoDecimals(maxs) mn = h2o_util.twoDecimals(mins) print "colname:", colname, "pctile (2 places):", pt print "colname:", colname, "maxs: (2 places):", mx print "colname:", colname, "mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too compareActual = mn[0], pt[3], pt[5], pt[7], mx[0] h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual) print "maxs colname:", colname, "(2 places):", mx print "mins colname:", colname, "(2 places):", mn trial += 1 scipyCol = 0
def test_GLM2grid_covtype_many(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'response': y, 'family': 'gaussian', 'n_folds': 2, 'max_iter': max_iter, 'beta_epsilon': 1e-3, 'lambda': '0,0.5,0.8', 'alpha': '0,1e-8,1e-4', } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "glmResult:", h2o.dump_json(glmResult) # assuming it doesn't complete right away, this is the first response # it differs for the last response job_key = glmResult['job_key'] grid_key = glmResult['destination_key'] jobs.append((job_key, grid_key)) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(4): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start # 2/GLMGridView.html?grid_key=asd # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1 for job_key, grid_key in jobs: gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key) h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." print "totalGLMGridJobs:", totalGLMGridJobs