def test_exec2_runif(self):
    print "in h2o-dev, params are column, min, max, seed"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'r.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    # work up to the failing case incrementally
    execExprList = [
        # hack to make them keys? (not really needed but interesting)
        # params for h2o-dev runif are: column, min, max, seed
        AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
        AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)),
        AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)),
        AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
    ]

    results = []
    for execExpr in execExprList:
        start = time.time()
        result = execExpr.do(timeoutSecs=30)
        results.append(result)
        execResult = execExpr.execResult
        print "exec took", time.time() - start, "seconds"
        print "exec result:", result
        print "exec result (full):", h2o.dump_json(execResult)
        h2o.check_sandbox_for_errors()

    rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)

    rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)

    sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)

    sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)

    # since there are no NAs in covtype, r.hex and s.hex should be identical?
    if 1 == 0:
        print "Comparing summary of r.hex to summary of s.hex"
        df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
        # time can be different
        print "df.difference:", h2o.dump_json(df.difference)
        self.assertLess(len(df.difference), 2)

    print "results from the individual exec expressions (ignore last which was an apply)"
    print "results:", results
    self.assertEqual(results,
        [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
def test_exec2_reduction(self):
    bucket = 'home-0xdiag-datasets'
    # csvPathname = 'airlines/year2013.csv'
    if getpass.getuser() == 'jenkins':
        csvPathname = 'standard/billion_rows.csv.gz'
    else:
        csvPathname = '1B/reals_1B_15f.data'
        csvPathname = '1B/reals_100000x1000_15f.data'

    hex_key = 'r1'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)

    inspect = h2o_cmd.runInspect(key=hex_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

    for execExpr in initList:
        result = execExpr.do(timeoutSecs=30)

    for execExpr in exprList:
        start = time.time()
        result = execExpr.do(timeoutSecs=30)
        execResult = execExpr.execResult
        print "exec took", time.time() - start, "seconds"
        print "exec result:", result
        print "exec result (full):", h2o.dump_json(execResult)
        h2o.check_sandbox_for_errors()
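# Note: initList and exprList are module-level lists defined elsewhere in the real test
# file and are not shown in this section. Below is a hedged sketch of what exprList
# might contain, reusing the AssignObj/Fcn/KeyIndexed wrappers seen in test_exec2_runif
# above; the actual expressions and function names in the file may differ.
exprListSketch = [
    # hypothetical reduction expressions over the parsed frame 'r1'
    AssignObj('a.hex', Fcn('sum', KeyIndexed('r1', col=0))),
    AssignObj('b.hex', Fcn('min', KeyIndexed('r1', col=0))),
    AssignObj('c.hex', Fcn('max', KeyIndexed('r1', col=0))),
]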
def test_parse_file_loop(self):
    lenNodes = len(h2o.nodes)
    trial = 0
    for i in range(2):
        for j in range(1, 10):
            # spread the parse around the nodes. Note that keys are produced by H2O, so keys are not reused
            nodeX = random.randint(0, lenNodes - 1)
            parseResult = h2i.import_parse(node=h2o.nodes[nodeX],
                bucket='smalldata', path='logreg/prostate.csv', schema='put')
            trial += 1
            # dump some cloud info so we can see keys?
            print "\nAt trial #" + str(trial)
            c = h2o.nodes[0].get_cloud()
            print h2o.dump_json(c)
def test_a_simple3(self):
    a = h2o.n0.endpoints()
    print h2o.dump_json(a)
    print "There are %s endpoints" % len(a['routes'])
    for l in a['routes']:
        print l['url_pattern']
def test_GBMGrid_basic_many(self):
    trainFilename = 'prostate.csv'
    train_key = 'prostate.hex'
    timeoutSecs = 300
    csvPathname = "logreg/" + trainFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put')
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    labelListUsed = list(labelList)
    numColsUsed = numCols

    parameters = {
        'validation_frame': train_key,
        'ignored_columns': "['ID']",  # this has to have []
        'response_column': 'CAPSULE',
        # 'balance_classes':
        # 'max_after_balance_size':
        # ??
        # 'ntrees': '[8, 10]',
        'ntrees': 8,
        # 'max_depth': '[8, 9]',
        'max_depth': 8,
        # ??
        # 'min_rows': '[1, 2]',
        'min_rows': 1,
        'nbins': 40,
        # ??
        # 'learn_rate': "[0.1, 0.2]",
        'learn_rate': 0.1,
        # FIX! doesn't like it?
        # 'loss': 'Bernoulli',
        # FIX..no variable importance for GBM yet?
        # 'variable_importance': False,
        # 'seed':
    }

    jobs = []
    # kick off 5 of these GBM grid jobs, with different tree choices
    start = time.time()
    totalGBMGridJobs = 0

    for i in range(5):
        modelKey = 'GBMGrid_prostate_%s' % i
        bmResult = h2o.n0.build_model(
            algo='gbm',
            destination_key=modelKey,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')
        print "GBMResult:", h2o.dump_json(bm)

        # FIX! is this right for gridded?
        job_key = bm.jobs[0].key.name
        # FIX! this isn't a fully formed name (%)
        model_key = bm.jobs[0].dest.name
        jobs.append((job_key, model_key))
        totalGBMGridJobs += 1

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start
    print "All GBM jobs completed in", elapsed, "seconds."
    print "totalGBMGridJobs:", totalGBMGridJobs

    for job_key, model_key in jobs:
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
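# Minimal sketch (assumption): OutputObj comes from the test harness and is not shown in
# this section. The dot-notation access used above (e.g. cmm.cm.confusion_matrix) could
# be provided by recursively turning dict keys into attributes, much like the Column
# helper in test_mixed_int_enum_many below. The real OutputObj may do more (printing,
# name tagging, list handling); this is only an illustration.
class DotDict(object):
    def __init__(self, d):
        for k, v in d.iteritems():
            if isinstance(v, dict):
                v = DotDict(v)  # recurse so nested dicts also get dot access
            setattr(self, k, v)

# usage sketch:
#   wrapped = DotDict({'cm': {'confusion_matrix': [[1, 0], [0, 1]]}})
#   print wrapped.cm.confusion_matrix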
def test_hdfs_cdh5(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        # ("covtype.169x.data", 1200),
        ("prostate_long_1G.csv", 200),
        ("airlines_all.csv", 1200),
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        if DO_EXPORT:
            start = time.time()
            print "Saving", csvFilename, 'to HDFS'
            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            print "Unique per-user to avoid permission issues"
            username = getpass.getuser()
            csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
            # reuse the file name to avoid running out of space
            csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)

            path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
            trial += 1

            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
def test_exec2_xorsum(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(10):
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            assert parse_key == hex_key
            assert numCols == colCount
            assert numRows == rowCount

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert len(missingList) == 0

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                for r in range(10):
                    if 1 == 0:
                        execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                        fpResult = execResult['scalar']
                    else:
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="h"))
                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="r1"))

                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way
                    # if ullResult != expectedUllSum and abs((ullResult - expectedUllSum) > 3):
                    if ullResult != expectedUllSum:
                        raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" %
                            (ullResult, expectedUllSum))
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                            (ullResult, expectedUllSum)

            h2o.check_sandbox_for_errors()

            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)

            print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
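# Minimal sketch (assumption): how a reference xorsum could be computed on the python
# side, treating each double as its raw 64-bit IEEE-754 bit pattern. This only
# illustrates what helpers like h2o_util.doubleToUnsignedLongLong and
# h2o_util.unsignedLongLongToDouble conceptually do; it is not the project's actual code.
import struct

def double_to_ull(d):
    # reinterpret the 8 bytes of a double as an unsigned 64-bit integer
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def ull_to_double(u):
    # reinterpret an unsigned 64-bit integer back into a double
    return struct.unpack('<d', struct.pack('<Q', u))[0]

def xorsum(values):
    # xor the bit patterns of all values; unlike a float sum, this is order-independent,
    # which is why the test can compare bit-exact results from h2o and python
    acc = 0
    for v in values:
        acc ^= double_to_ull(v)
    return acc

# usage sketch:
#   data = [1.5, 2.25, -3.125]
#   print "xorsum bits: 0x%0.16x" % xorsum(data)
#   print "as a double:", ull_to_double(xorsum(data))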
def test_0_NA_2enum(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 30, '0', 'cC', 100),
        (100, 30, '0.0', 'cC', 100),
        (100, 30, '0.0000000', 'cC', 100),
    ]

    for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount,
            expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        # column 0 not used here
        # assert len(expected) == 6
        # FIX! add expected and maxDelta?
        co = h2o_cmd.runSummary(key=hex_key, column=0)
        print co.label, co.type, co.missing, co.domain, sum(co.bins)
        coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean,
            co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma,
            co.str_data, co.stride, co.type, co.zeros]
        for k, v in co:
            print k, v

        if DO_REBALANCE:
            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % hex_key
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'
        else:
            rb_key = hex_key

        print "Now doing to_enum across all columns of %s" % hex_key
        for column_index in range(colCount):
            # is the column index 1-based in to_enum?
            result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index + 1)
            # print "\nto_enum result:", h2o.dump_json(result)

            co = h2o_cmd.runSummary(key=hex_key, column=column_index + 1)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)
            coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean,
                co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma,
                co.str_data, co.stride, co.type, co.zeros]

            if co.type != 'Enum':
                raise Exception("column %s, which has name %s, didn't convert to Enum, is %s" %
                    (column_index, co.label, co.type))

            # I'm generating NAs, so the count should be > 0, but it could be zero.
            # I guess I have enough rows to get at least 1.
            if co.missing <= 0 or co.missing > rowCount:
                raise Exception("column %s, which has name %s, somehow got the NA count wrong after convert to Enum: %s %s" %
                    (column_index, co.label, co.missing, rowCount))

            if co.domain != 1:  # NAs don't count?
                # print "stats:", h2o.dump_json(stats)
                print "column:", h2o.dump_json(co)
                raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" %
                    (column_index, co.label, co.domain))
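# Hypothetical sketch (assumption): write_syn_dataset is defined elsewhere in the real
# test file and is not shown in this section. Based on how test_0_NA_2enum calls it, it
# presumably writes rowCount x colCount cells of the constant 'zero' token ('0', '0.0',
# '0.0000000') as CSV; the real helper likely also injects some NAs, since the test later
# expects co.missing > 0 after to_enum.
def write_syn_dataset_sketch(csvPathname, rowCount, colCount, zero, seed):
    # seed is accepted to match the call site; this simplified sketch doesn't use it
    with open(csvPathname, 'w') as f:
        for _ in range(rowCount):
            f.write(','.join([zero] * colCount) + '\n')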
def test_hdfs_hdp2_1(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"

    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        # ("covtype.169x.data", 1200),
        ("prostate_long_1G.csv", 200),
        ("airlines_all.csv", 1200),
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename

        # Do a simple typeahead check on the directory
        # typeaheadResult 2: {
        #   "__meta": {
        #     "schema_name": "TypeaheadV2",
        #     "schema_type": "Iced",
        #     "schema_version": 2
        #   },
        #   "limit": 2,
        #   "matches": [
        #     "hdfs://172.16.2.186/datasets/15Mx2.2k.csv",
        #     "hdfs://172.16.2.186/datasets/1Mx2.2k.NAs.csv"
        #   ],
        #   "src": "hdfs://172.16.2.186/datasets/"
        # }
        typeaheadPath = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/datasets/"
        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=2)
        print "typeaheadResult 2:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) == 2

        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=0)
        print "typeaheadResult 0:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) > 2

        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=None)
        print "typeaheadResult None:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) > 2

        typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath, limit=-1)
        print "typeaheadResult -1:", dump_json(typeaheadResult)
        assert len(typeaheadResult['matches']) > 2

        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        if DO_EXPORT:
            start = time.time()
            print "Saving", csvFilename, 'to HDFS'
            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            print "Unique per-user to avoid permission issues"
            username = getpass.getuser()
            csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
            # reuse the file name to avoid running out of space
            csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)

            path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
            trial += 1

            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
def test_mixed_int_enum_many(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
    enumList = ['abc', 'def', 'ghi']
    # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
    intList = [0, 1, 2, '']
    expectedList = ['abc', 'def', 'ghi']

    tryList = [
        # not sure about this case
        # some of the cases interpret as ints now (not as enum)
        (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False),
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True),
        # fails this case
        (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True),
        (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1], True),
        (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True),
        # this case seems to fail
        (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True),
        # this seems wrong also
        (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2], True),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    class Column(object):
        def __init__(self, column):
            assert isinstance(column, dict)
            for k, v in column.iteritems():
                setattr(self, k, v)  # achieves self.k = v

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, enumChoices, intChoices)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', check_header=0,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        print "numRows:", numRows, "numCols:", numCols
        inspect = h2o_cmd.runInspect(None, hex_key)

        print "\nTrial:", trial, csvFilename

        # this summary only does one column?
        # assert colCount == len(columns), "%s %s" % (colCount, len(columns))
        for i in range(colCount):
            summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i + 1))
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # columns = summaryResult['frames'][0]['columns']
            co = Column(summaryResult)
            # how are enums binned. Stride of 1? (what about domain values)
            coList = [
                co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean,
                co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma,
                co.str_data, co.stride, co.type, co.zeros,
            ]
            coNameList = [
                'co.base', 'len(co.bins)', 'len(co.data)', 'co.domain', 'co.label', 'co.maxs', 'co.mean',
                'co.mins', 'co.missing', 'co.ninfs', 'co.pctiles', 'co.pinfs', 'co.precision', 'co.sigma',
                'co.str_data', 'co.stride', 'co.type', 'co.zeros',
            ]

            for c, n in zip(coList, coNameList):
                print n + ":", c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            # This can go to NaN (string) with big numbers
            # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            # can be None if col is all NA
            # print "FIX! hacking the co.pctiles because it's short by two"
            # pctiles = [0] + co.pctiles + [0]

            assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows)

            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(co.type, 'enum',
                    "Expecting co.type %s to be 'enum' for %s co label %s" % (co.type, i, co.label))

            if ENABLE_ASSERTS and resultIsEnum:
                # not always there
                cardinality = len(co.domain)
                self.assertEqual(cardinality, len(enumChoices),
                    msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices)))

            # assume I create the list above in the same order that h2o will show the order. sorted?
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(co.bins, enumChoices)

            hcntTotal = sum(co.bins)
            numRowsCreated = rowCount + len(intChoices)
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])

            self.assertEqual(numRows, numRowsCreated,
                msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated))

            nacnt = co.missing
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(nacnt, expectedNaCnt[i],
                    "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt))

            # FIX! no checks for the case where it got parsed as int column!

        trial += 1