@classmethod
def setUpClass(cls):
    global SEED
    SEED = h2o.setup_random_seed()
    h2o.init(2)
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
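# NOTE: the tests below lean on a per-file write_syn_dataset() helper defined
# elsewhere (its signature varies per test file). A minimal sketch of the
# common shape, under that assumption -- hypothetical, not the real helper:
def write_syn_dataset_sketch(csvPathname, rowCount, colCount, SEED):
    # seed a private RNG so each synthetic file is reproducible from SEED
    r = random.Random(SEED)
    with open(csvPathname, 'w') as dsf:
        for _ in range(rowCount):
            # one random small-int cell per column, comma separated
            rowData = [str(r.randint(0, 9)) for _ in range(colCount)]
            dsf.write(','.join(rowData) + '\n')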
def test_parse_rand_utf8(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    print "HACK: reduce rows to 10 for debug"
    tryList = [
        # do two cols to detect bad eol behavior
        (10, 2, 'cA', 120),
        (10, 2, 'cG', 120),
        (10, 2, 'cH', 120),
    ]

    print "What about messages to log (INFO) about unmatched quotes (before eol)"
    # got this..trying to avoid for now
    # Exception: rjson error in parse: Argument 'source_key' error:
    #   Parser setup appears to be broken, got AUTO

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', check_header=0,
            hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
        print "parseResult:", dump_json(parseResult)

        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        assert len(missingList) == 0
        # FIX! check type?
        # print "inspect:", h2o.dump_json(inspect)

        self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
        self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
def test_parse_100k_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 1000, 'cA', 200, 200),
        (10, 2000, 'cA', 200, 200),
        (10, 4000, 'cA', 200, 200),
        (10, 8000, 'cA', 200, 200),
        (10, 9000, 'cA', 200, 200),
        (10, 10000, 'cA', 200, 200),
        (10, 100000, 'cA', 200, 200),
        # (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False, column_names=None,
            intermediateResults=DO_INTERMEDIATE_RESULTS)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        print "Skipping the delete keys for now"
        if 1 == 0:
            # if not h2o.browse_disable:
            #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #     time.sleep(5)
            h2i.delete_keys_at_all_nodes()
def test_plot_remove_keys(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100000, 100, 'cG', 400),
        (200000, 100, 'cH', 400),
        (400000, 100, 'cI', 400),
        (800000, 100, 'cJ', 400),
        (1000000, 100, 'cK', 400),
    ]

    xList = []
    eList = []
    fList = []
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        NUM_CASES = h2o_util.fp_format()
        sel = random.randint(0, NUM_CASES - 1)
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parseElapsed = pA.python_elapsed
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        print parse_key, parseElapsed, numRows, numCols
        labelList = iA.labelList

        node = h2o.nodes[0]
        print "Deleting", hex_key, "at", node.http_addr, \
            "Shouldn't matter what node the delete happens at..global?"
        start = time.time()
        node.remove_key(hex_key, timeoutSecs=30)
        removeElapsed = time.time() - start
        print "Deleting", hex_key, "took", removeElapsed, "seconds"

        xList.append(numRows)
        eList.append(parseElapsed)
        fList.append(removeElapsed)

    # just plot the last one
    if 1 == 1:
        xLabel = 'numRows'
        eLabel = 'parseElapsed'
        fLabel = 'removeElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
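# h2o_gbm.plotLists() above charts the parse/remove timings against xList.
# A matplotlib stand-in under that assumption -- hypothetical, not the real
# helper, with a simplified signature that drops the unused *Title args:
def plot_lists_sketch(xList, xLabel, eList, eLabel, fList, fLabel):
    import matplotlib.pyplot as plt
    # two timing series over the same x axis, one line each
    plt.plot(xList, eList, marker='o', label=eLabel)
    plt.plot(xList, fList, marker='o', label=fLabel)
    plt.xlabel(xLabel)
    plt.ylabel('seconds')
    plt.legend()
    plt.show()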
def test_parse_long_colnames(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 1000, 'cA', 200, 200),
        (10, 1100, 'cA', 200, 200),
        (10, 1200, 'cA', 200, 200),
        (10, 1300, 'cA', 200, 200),
        (10, 1400, 'cA', 200, 200),
        (10, 1500, 'cA', 200, 200),
        (10, 1600, 'cA', 200, 200),
        (10, 1700, 'cA', 200, 200),
        (10, 1800, 'cA', 200, 200),
        (10, 1900, 'cA', 200, 200),
        (10, 2000, 'cA', 200, 200),
        (10, 4000, 'cA', 200, 200),
        (10, 8000, 'cA', 200, 200),
        (10, 9000, 'cA', 200, 200),
        (10, 10000, 'cA', 200, 200),
        # (10, 100000, 'cA', 200, 200),
        # (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False, column_names=None,
            intermediateResults=DO_INTERMEDIATE_RESULTS)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        print "Skipping the delete keys for now"
        if 1 == 0:
            # if not h2o.browse_disable:
            #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #     time.sleep(5)
            h2i.delete_keys_at_all_nodes()
def test_rapids_mean(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 5, 'cA', 200),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

        data_key = hex_key
        data_key2 = hex_key + "_2"
        for trial in range(4):
            result_key = data_key + "_" + str(trial)
            # copy the key
            Assign(data_key2, data_key)
            Assign(result_key, Fcn('mean', KeyIndexed(data_key2, col=0), 0, False))
def test_parse_syn_gz_cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # summary fails with 100000 cols
        # overwrite the key each time to save space?
        (100, 100, 'cF', 600),
        (100, 5000, 'cF', 600),
        (100, 10000, 'cF', 600),
        # (100, 12000, 'cF', 600),
        # (100, 15000, 'cF', 600),
        # (100, 17000, 'cF', 600),
        (100, 20000, 'cF', 600),
        (100, 40000, 'cF', 600),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        csvFilenamegz = csvFilename + ".gz"
        csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
        h2o_util.file_gzip(csvPathname, csvPathnamegz)

        parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
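# h2o_util.file_gzip() above compresses the CSV before import. Assuming it is
# a plain byte-for-byte gzip of the source file, a minimal stand-in:
def file_gzip_sketch(src, dst):
    import gzip, shutil
    # stream src into a gzip file at dst without loading it all into memory
    with open(src, 'rb') as fin:
        fout = gzip.open(dst, 'wb')
        shutil.copyfileobj(fin, fout)
        fout.close()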
def test_GLM_many_cols_4(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    translateList = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                     'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u']
    tryList = [
        (100000, 10, 'cA', 600),
        (100000, 100, 'cA', 600),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        labelListUsed = list(labelList)
        print "labelListUsed", labelListUsed
        response = labelListUsed[-1]
        labelListUsed.remove(response)
        numColsUsed = numCols - 1

        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                # FIX! for now just use a column that's binomial
                'response_column': response,  # can't take index now?
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5,0.25, 0.1]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'use_all_factor_levels': False,
                'n_folds': 1,
            }

            model_key = 'many_cols_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_parse_rand_enum_compress(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    if DEBUG:
        n = 20
    else:
        n = 1000000

    # from command line arg -long
    if h2o_args.long_test_case:
        repeat = 1000
        scale = 10  # scale up the # of rows
        tryList = [
            (n * scale, 1, 'cI', 300),
            (n * scale, 1, 'cI', 300),
            (n * scale, 1, 'cI', 300),
        ]
    else:
        repeat = 1
        scale = 1
        tryList = [
            (n, 3, 'cI', 300),
            (n, 3, 'cI', 300),
            (n, 3, 'cI', 300),
        ]

    lastcolsHistory = []

    enumList = create_enum_list(listSize=ENUMS_NUM)

    for r in range(repeat):
        SEED_PER_FILE = random.randint(0, sys.maxint)
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # same enum list/mapping, but different dataset?
            start = time.time()
            lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
                scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE)
            elapsed = time.time() - start
            print "took %s seconds to create %s" % (elapsed, csvPathname)
            # why are we saving this?
            lastcolsHistory.append(lastcols)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                check_header=0, timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY)
            # optional. only needed to extract parse_key?
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)

            pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
            self.assertEqual(rowCount, iA.numRows)
            self.assertEqual(colCount, iA.numCols)
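# create_enum_list() above builds the enum domain the dataset draws from.
# A minimal sketch, assuming it just returns listSize random short strings
# (hypothetical; the real helper may constrain characters differently):
def create_enum_list_sketch(listSize=10, strLen=4):
    return [''.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(strLen))
        for _ in range(listSize)]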
def test_quant_cmp_uniform(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (5 * ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
        (5 * ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
        (1 * ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
        (1 * ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
        (1 * ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
        (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
        (1 * ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
        (1 * ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100.00]),
        (1 * ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
        (1 * ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
        # max error = half the bin size?
        colname = expected[0]
        maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta

        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
        # need the full pathname when python parses the csv for numpy/sort
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        #***************************
        # Parse
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        numRows = pA.numRows
        numCols = pA.numCols
        parse_key = pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        #***************************
        # Summary
        co = h2o_cmd.runSummary(key=parse_key)
        default_pctiles = co.default_pctiles

        coList = [
            co.base, len(co.bins), len(co.data), co.domain,
            co.label, co.maxs, co.mean, co.mins,
            co.missing, co.ninfs, co.pctiles,
            co.pinfs, co.precision, co.sigma, co.str_data,
            co.stride, co.type, co.zeros]
        for c in coList:
            print c

        print "len(co.bins):", len(co.bins)
        print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
        print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

        print "FIX! hacking the co.pctiles because it's short by two"
        summ_pctiles = [0] + co.pctiles + [0]

        pt = h2o_util.twoDecimals(summ_pctiles)
        mx = h2o_util.twoDecimals(co.maxs)
        mn = h2o_util.twoDecimals(co.mins)
        exp = h2o_util.twoDecimals(expected[1:])

        print "co.label:", co.label, "co.pctiles (2 places):", pt
        print "default_pctiles:", default_pctiles
        print "co.label:", co.label, "co.maxs: (2 places):", mx
        print "co.label:", co.label, "co.mins: (2 places):", mn

        # FIX! we should do an exec and compare using the exec quantile too
        h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",
            mn[0], pt[3], pt[5], pt[7], mx[0])
        h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",
            exp[0], exp[1], exp[2], exp[3], exp[4])

        #***************************
        # Quantile
        # the thresholds h2o used, should match what we expected
        # using + here seems to result in an odd tuple..doesn't look right to h2o param
        # so went with this. Could add '[' and ']' to the list first, before the join.
        probsStr = "[%s]" % ",".join(map(str, probsList))
        parameters = {
            'model_id': "a.hex",
            'training_frame': parse_key,
            'validation_frame': parse_key,
            'ignored_columns': None,
            'probs': probsStr,
        }

        model_key = 'qhex'
        bmResult = h2o.n0.build_model(algo='quantile', model_id=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        msec = bm.jobs[0]['msec']
        print "bm msec", msec

        # quantile result is just a job result to a key
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0], 'model')
        print "model.output:", model.output
        print "model.output:['quantiles']", model.output['quantiles']
        print "model.output:['iterations']", model.output['iterations']
        print "model.output:['names']", model.output['names']

        quantiles = model.output['quantiles'][0]  # why is this a double array
        iterations = model.output['iterations']
        assert iterations == 11, iterations
        print "quantiles: ", quantiles
        print "iterations: ", iterations

        # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        # cmm = OutputObj(cmmResult, 'cmm')

        # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        # mm = OutputObj(mmResult, 'mm')

        # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        trial += 1
        # compare the last threshold
        if colname != '':
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                col=0,  # what col to extract from the csv
                datatype='float',
                quantile=CHECK_PCTILE,
                # h2oSummary2=pctile[-1],
                # h2oQuantilesApprox=result,  # from exec
                h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
            )
        h2o.nodes[0].remove_all_keys()
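# h2o_summ.quantile_comparisons() above cross-checks h2o's quantiles against a
# python-side computation on the raw csv. For reference, a self-contained
# percentile on a pre-sorted list with linear interpolation between closest
# ranks (an assumption; the real helper may interpolate differently):
def percentile_on_sorted_sketch(vals, p):
    import math
    # vals: ascending sort; p: fraction in [0, 1]
    k = (len(vals) - 1) * p
    f = int(math.floor(k))
    c = int(math.ceil(k))
    if f == c:
        return vals[f]
    # weight the two neighboring ranks by distance from k
    return vals[f] * (c - k) + vals[c] * (k - f)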
def test_PCA_many_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10000, 10, 'cA', 300),
        (10000, 50, 'cB', 300),
        (10000, 100, 'cC', 300),
        # (10000, 500, 'cH', 300),
        # (10000, 1000, 'cI', 300),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        print (rowCount, colCount, hex_key, timeoutSecs)
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # PARSE ****************************************
        modelKey = 'PCAModelKey'
        scoreKey = 'PCAScoreKey'

        # Parse ****************************************
        parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # PCA (tolerance iterate) ****************************************
        for tolerance in [i / 10.0 for i in range(11)]:
            parameters = {
                # 'tolerance': tolerance,
                # 'standardize': 1,
                'k': 1,
            }
            model_key = 'pca.hex'
            bmResult = h2o.n0.build_model(algo='pca', model_id=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_mixed_int_enum_many(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
    enumList = ["abc", "def", "ghi"]
    # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
    intList = [0, 1, 2, ""]
    expectedList = ["abc", "def", "ghi"]

    tryList = [
        # not sure about this case
        # some of the cases interpret as ints now (not as enum)
        (ROWS, COLS, "a.hex", enumList[0:1], expectedList[0:1], intList[0:2], False),
        # colname, (min, 25th, 50th, 75th, max)
        (ROWS, COLS, "b.hex", enumList[0:2], expectedList[0:2], intList[0:1], True),
        # fails this case
        (ROWS, COLS, "c.hex", enumList[0:1], expectedList[0:1], intList[0:1], True),
        (ROWS, COLS, "d.hex", enumList[0:], expectedList[0:], intList[0:1], True),
        (ROWS, COLS, "e.hex", enumList[0:2], expectedList[0:2], intList[0:2], True),
        # this case seems to fail
        (ROWS, COLS, "f.hex", enumList[0:1], expectedList[0:1], intList[0:2], True),
        # this seems wrong also
        (ROWS, COLS, "g.hex", enumList[0:], expectedList[0:], intList[0:2], True),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    class Column(object):
        def __init__(self, column):
            assert isinstance(column, dict)
            for k, v in column.iteritems():
                setattr(self, k, v)  # achieves self.k = v

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
            enumChoices, intChoices)

        parseResult = h2i.import_parse(path=csvPathname, schema="put", check_header=0,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        print "numRows:", numRows, "numCols:", numCols
        inspect = h2o_cmd.runInspect(None, hex_key)

        print "\nTrial:", trial, csvFilename

        # this summary only does one column?
        # assert colCount == len(columns), "%s %s" % (colCount, len(columns))
        for i in range(colCount):
            summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i + 1))
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # columns = summaryResult['frames'][0]['columns']
            co = Column(summaryResult)
            # how are enums binned. Stride of 1? (what about domain values)
            coList = [
                co.base, len(co.bins), len(co.data), co.domain,
                co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data,
                co.stride, co.type, co.zeros,
            ]
            coNameList = [
                "co.base", "len(co.bins)", "len(co.data)", "co.domain",
                "co.label", "co.maxs", "co.mean", "co.mins",
                "co.missing", "co.ninfs", "co.pctiles",
                "co.pinfs", "co.precision", "co.sigma", "co.str_data",
                "co.stride", "co.type", "co.zeros",
            ]
            for c, name in zip(coList, coNameList):
                print name + ":", c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            # This can go to NaN (string) with big numbers
            # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            # can be None if col is all NA
            # print "FIX! hacking the co.pctiles because it's short by two"
            # pctiles = [0] + co.pctiles + [0]

            assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows)

            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(co.type, "enum",
                    "Expecting co.type %s to be 'enum' for %s co label %s" % (co.type, i, co.label))

            if ENABLE_ASSERTS and resultIsEnum:
                # not always there
                cardinality = len(co.domain)
                self.assertEqual(cardinality, len(enumChoices),
                    msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices)))

            # assume I create the list above in the same order that h2o will show the order. sorted?
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(co.bins, enumChoices)

            hcntTotal = sum(co.bins)
            numRowsCreated = rowCount + len(intChoices)
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])

            self.assertEqual(numRows, numRowsCreated,
                msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated))

            nacnt = co.missing
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(nacnt, expectedNaCnt[i],
                    "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt))

            # FIX! no checks for the case where it got parsed as int column!
        trial += 1
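# The Column adapter above only lifts dict entries to attributes, e.g.:
#   co = Column({'label': 'C1', 'type': 'enum'})
#   co.label == 'C1' and co.type == 'enum'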
def test_exec2_enums_rand_cut(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = ROWS
    tryList = [
        (n, 10, 9, "cE", 300),
    ]

    # create key names to use for exec
    eKeys = ["e%s" % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        hex_key = "p"
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        print "Creating", CUT_EXPR_CNT, "cut expressions"
        for j in range(CUT_EXPR_CNT):
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression
            cols = random.sample(range(iColCount), random.randint(1, iColCount))
            for c in cols:
                # possible choices within the column
                cel = colEnumList[c]
                # for now the cutValues are numbers for the enum mappings
                # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                # celChoice = str(random.choice(range(len(cel))))
                celChoice = random.choice(range(len(cel)))
                cutValue[c] = celChoice

            cutExprList = []
            pKey = Key("p")
            for i, c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    # cutExprList.append('p$C'+str(i+1)+'=='+c)
                    # all column indexing in h2o-dev is with number
                    e = Fcn("==", c, pKey[:, i])
                    cutExprList.append(e)

            cutExpr = None
            for ce in cutExprList:
                if cutExpr:
                    cutExpr = Fcn("&", cutExpr, ce)
                else:
                    cutExpr = ce
            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            hKey = Key(hex_key)
            rowExpr = hKey[cutExpr, :]
            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE,
            colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        # print h2o.dump_json(inspect)

        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

        # error if any col has constant values
        # if len(constantValuesDict) != 0:
        #     raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!

        # build up the columns
        Assign("b", [1, 2, 3])
        # could also append 1 col at a time, by assigning to the next col number?
        Assign("a", Cbind(["b" for i in range(colCount)]))

        for eKey in eKeys:
            Assign(eKey, "a")
            ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            randICol = random.randint(0, iColCount - 1)
            randOCol = random.randint(iColCount, iColCount + oColCount - 1)

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            if 1 == 1:
                start = time.time()
                Assign(fKey, random.choice(rowExprList)).do()
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            if numRows == 0 or numCols != colCount:
                h2p.red_print("Warning: Cut resulted in", numRows,
                    "rows and", numCols, "cols. Quantile will abort")

            # FIX! put quantile back in?
            quantileTime = 0

            # remove all keys*******************************************************
            # what about hex_key?
            if 1 == 0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on", csvFilename, "took", elapsed, "seconds."

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = "trial"
        eLabel = "exec cut time"
        fLabel = "quantile time"
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_0_NA_2enum(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 30, '0', 'cC', 100),
        (100, 30, '0.0', 'cC', 100),
        (100, 30, '0.0000000', 'cC', 100),
    ]

    for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        # column 0 not used here
        # assert len(expected) == 6
        # FIX! add expected and maxDelta?
        co = h2o_cmd.runSummary(key=hex_key, column=0)
        print co.label, co.type, co.missing, co.domain, sum(co.bins)
        coList = [co.base, len(co.bins), len(co.data), co.domain,
            co.label, co.maxs, co.mean, co.mins,
            co.missing, co.ninfs, co.pctiles,
            co.pinfs, co.precision, co.sigma, co.str_data,
            co.stride, co.type, co.zeros]
        for c in coList:
            print c

        if DO_REBALANCE:
            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % hex_key
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on", csvFilename, 'took', elapsed, 'seconds'
        else:
            rb_key = hex_key

        print "Now doing to_enum across all columns of %s" % hex_key
        for column_index in range(colCount):
            # is the column index 1-based in to_enum
            result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index + 1)
            # print "\nto_enum result:", h2o.dump_json(result)

            co = h2o_cmd.runSummary(key=hex_key, column=column_index + 1)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)
            coList = [co.base, len(co.bins), len(co.data), co.domain,
                co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data,
                co.stride, co.type, co.zeros]

            if co.type != 'Enum':
                raise Exception("column %s, which has name %s, didn't convert to Enum, is %s" %
                    (column_index, co.label, co.type))

            # I'm generating NA's..so it should be > 0. but it could be zero.
            # I guess I have enough rows to get at least 1
            if co.missing <= 0 or co.missing > rowCount:
                raise Exception("column %s, which has name %s, somehow got NA cnt wrong after convert to Enum %s %s" %
                    (column_index, co.label, co.missing, rowCount))

            if len(co.domain) != 1:  # NAs don't count?
                # print "stats:", h2o.dump_json(stats)
                print "column:", h2o.dump_json(co)
                raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" %
                    (column_index, co.label, co.domain))
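# This test's write_syn_dataset() mixes the chosen zero literal with empty
# cells, so to_enum sees a single real level plus NAs. A sketch under that
# assumption (hypothetical; the real NA rate/layout may differ):
def write_syn_dataset_zero_sketch(csvPathname, rowCount, colCount, zero, SEED):
    r = random.Random(SEED)
    with open(csvPathname, 'w') as dsf:
        for _ in range(rowCount):
            # ~90% zero-literal cells, ~10% empty cells (parsed as NA)
            row = [zero if r.random() < 0.9 else '' for _ in range(colCount)]
            dsf.write(','.join(row) + '\n')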
def test_w2v_basic(self):
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 500000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
        (n, 7, 'cJ', 300),
        (n, 9, 'cK', 300),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        csvPathname = create_file_with_seps(rowCount, colCount)

        # just parse to make sure it's good
        parseResult = h2i.import_parse(path=csvPathname, check_header=1,
            delete_on_done=0, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        cA = h2o_test.OutputObj(iA.columns[0], "inspect_column")

        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        for i in range(colCount):
            print cA.type, cA.missing_count
            self.assertEqual(0, cA.missing_count,
                "Column %s Expected %s. missing: %s is incorrect" % (i, 0, cA.missing_count))
            self.assertEqual('string', cA.type,
                "Column %s Expected %s. type: %s is incorrect" % (i, 'string', cA.type))

        if DO_SUMMARY:
            for i in range(colCount):
                co = h2o_cmd.runSummary(key=parse_key, column=i)
                print co.label, co.type, co.missing, co.domain, sum(co.bins)
                self.assertEqual(0, co.missing_count,
                    "Column %s Expected %s. missing: %s is incorrect" % (i, 0, co.missing_count))
                self.assertEqual('String', co.type,
                    "Column %s Expected %s. type: %s is incorrect" % (i, 'String', co.type))

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(1):
            parameters = {
                'validation_frame': parse_key,  # KeyIndexed False []
                'ignored_columns': None,  # string[] None []
                'minWordFreq': 5,  # int 5 []
                'wordModel': 'SkipGram',  # enum [u'CBOW', u'SkipGram']
                'normModel': 'HSM',  # enum [u'HSM', u'NegSampling']
                'negSampleCnt': 5,  # int 5 []
                'vecSize': 100,  # int 100
                'windowSize': 5,  # int 5
                'sentSampleRate': 0.001,  # float 0.001
                'initLearningRate': 0.05,  # float 0.05
                'epochs': 1,  # int 5
            }

            model_key = 'benign_w2v.hex'
            bmResult = h2o.n0.build_model(algo='word2vec', model_id=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            # not implemented?
            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_w2v_basic_2(self):
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 100
    tryList = [
        # (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
        (n, 7, 'cJ', 300),
        (n, 9, 'cK', 300),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        csvPathname = create_file_with_seps(rowCount, colCount)
        hex_key = "not_used.hex"

        # just parse to make sure it's good
        parseResult = h2i.import_parse(path=csvPathname, check_header=1,
            delete_on_done=0, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        src_key = h2i.find_key('syn_.*csv')

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(1):
            parameters = {
                'validation_frame': parse_key,  # KeyIndexed False []
                'ignored_columns': None,  # string[] None []
                'minWordFreq': 1,  # int 5 []
                'wordModel': 'CBOW',  # enum [u'CBOW', u'SkipGram']
                'normModel': 'NegSampling',  # enum [u'HSM', u'NegSampling']
                'negSampleCnt': 1,  # int 5 []
                'vecSize': 10,  # int 100
                'windowSize': 2,  # int 5
                'sentSampleRate': 0.001,  # float 0.001
                'initLearningRate': 0.05,  # float 0.05
                'epochs': 1,  # int 5
            }

            model_key = 'benign_w2v.hex'
            bmResult = h2o.n0.build_model(algo='word2vec', model_id=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_rapids_overloaded_opr(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (1000000, 5, 'cA', 200),
        (1000, 5, 'cA', 200),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

        # Xbase.debugOnly = True

        REPEAT = 1
        data_key = hex_key
        for i in range(REPEAT):
            result_key = data_key + "_" + str(i)
            Assign('s1', Seq(range(5)))

            # take advantage of default params for row/col (None)
            # need the 'c' function, to make sure the key is created
            # first try as object, then method
            Assign('s2', Fcn('c', Seq(range(5))))

            # just combine
            Assign('s3', Col(Seq(range(5))))

            inspect = h2o_cmd.runInspect(key='s3')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows == 5
            assert numCols == 1

            Assign('s2', Col(Seq(range(5))))
            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows == 5
            assert numCols == 1

            # can't have sequence of sequences?
            # make sure key is created with c()
            f = Fcn('c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52)))
            Assign('s1', f)

            f = Col(Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52)))
            Assign('s2', f)

            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows == 313
            assert numCols == 1

            print "Now trying to do the functions with the alternate overloaded operators"
            data_key = Key(parse_key)
            result_key = Key()
            # what triggers immediate operation at h2o
            # as opposed to an object within a function
            result_key.frame = 'a1'
            result_key <<= data_key[Seq(range(1, 4)), :]
            result_key.frame = 'a2'
            result_key <<= data_key[Seq(range(1, 4)), :]
            result_key.frame = 'a3'
            result_key <<= data_key[Seq(range(1, 4)), :]
            result_key.frame = 'a4'
            result_key <<= data_key[Seq(range(1, 4)), 0:1]
            result_key.frame = 'a5'
            result_key <<= data_key[Seq(range(1, 4)), 0:1]
            result_key.frame = 'a6'
            result_key <<= data_key[[1, 2, 3], 1]

        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)
def test_rapids_funs_1op(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (1000000, 5, 'cA', 200),
        (1000, 5, 'cA', 200),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

        # Xbase.debugOnly = True

        REPEAT = 1
        data_key = hex_key
        data_key2 = hex_key + "_2"
        trial = 0
        good = []
        bad = []
        both = h2o_xl.xFcnOp1Set.union(h2o_xl.xFcnOp3Set)
        both = h2o_xl.xFcnOp1Set
        for fun in both:
            a = None
            try:
                result_key = data_key + "_" + str(trial)
                # copy the key
                Assign(data_key2, data_key)

                # a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True))
                # a = Assign(result_key, Fcn('sum', KeyIndexed(data_key2, col=0), True))
                # a = Assign(result_key, Fcn('xorsum', KeyIndexed(data_key2, col=0), True))
                # a = Assign(result_key, Fcn('sqrt', KeyIndexed(data_key2, col=0)))
                # a = Assign(result_key, Fcn('ncol', KeyIndexed(data_key2, col=0)))

                # what's wrong with mean?
                if fun in ['ncol', 'asin', 'any.factor', 'sin', 'atan', 'tan', 'sign', 'log',
                        'exp', 'sqrt', 'abs', 'floor', 'ceiling', 'trunc', 'is.factor',
                        'is.na', 'any.na', 'nrow', 'tanh', 'length', 'acos', 'cos', 'sinh', 'cosh']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0)))
                    good.append(fun)
                elif fun in ['sum', 'max', 'min', 'xorsum', 'sd']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True))
                    good.append(fun)
                elif fun in ['scale']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False))
                    good.append(fun)
                elif fun in ['round', 'signif']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1))
                    good.append(fun)
                elif fun in ['seq_len', 'rep_len']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 4))
                    good.append(fun)
                elif fun in ['seq']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1, 5, 1))
                    good.append(fun)
                elif fun in ['mean']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 0, False))
                    good.append(fun)
                elif fun in ['var']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False, False))
                    good.append(fun)
                elif fun in ['match']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0),
                        KeyIndexed(data_key2, col=0), 1, None))
                    good.append(fun)
                elif fun in ['unique']:
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, 10, 1))
                    good.append(fun)
                else:
                    # bad functions kill h2o?
                    a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), None))
                    bad.append(fun)

                # a = Fcn(fun, KeyIndexed(data_key, col=0), '%FALSE')
                # a = Fcn(fun, data_key, '%FALSE')
                # a = Fcn(fun, data_key)

                # scalars?
                if 1 == 0:
                    inspect = h2o_cmd.runInspect(key=result_key)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                    assert numRows == 1000, numRows
                    assert numCols == 1, numCols

                print "\n" + csvPathname, \
                    " numRows:", "{:,}".format(numRows), \
                    " numCols:", "{:,}".format(numCols)

            except:
                if not a:
                    # print dump_json(a.execResult)
                    bad.append(fun)

            trial += 1

    print "good:", good
    print "bad:", bad
def test_rapids_row_range(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (1000000, 5, 'cA', 200),
        (1000, 5, 'cA', 200),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

        # Xbase.debugOnly = True

        REPEAT = 1
        data_key = hex_key
        for i in range(REPEAT):
            result_key = data_key + "_" + str(i)
            # Assign('s1', Seq(range(5))).do
            Assign('s1', Seq(range(5)))

            # take advantage of default params for row/col (None)
            # need the 'c' function, to make sure the key is created
            # first try as object, then method
            Assign('s2', Fcn('c', Seq(range(5))))
            print dump_json(Xbase.lastExecResult)
            print dump_json(Xbase.lastResult)

            # just combine
            Assign('s3', Col(Seq(range(5))))

            inspect = h2o_cmd.runInspect(key='s3')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows == 5
            assert numCols == 1

            Assign('s2', Col(Seq(range(5))))
            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows == 5
            assert numCols == 1

            # can't have sequence of sequences?
            # make sure key is created with c()
            f = Fcn('c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52)))
            Assign('s1', f)

            f = Col(Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52)))
            Assign('s2', f)

            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows == 313
            assert numCols == 1

            print "z1"
            Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5))))
            print "z2"
            Assign('s1', KeyIndexed(data_key, row=Seq(Colon(99, 400), "#2", 1, range(1, 5))))
            print "z3"
            Assign(result_key, KeyIndexed(data_key, row='#1'))
            print "z4"
            Assign(result_key, KeyIndexed(data_key, row=Colon('#1', '#100')))
            print "z5"
            Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100)))

            # this should fail rapids because of reverse msb/lsb
            # illegal, detected
            # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1')))

            print "z6"
            Assign(result_key, KeyIndexed(data_key, row=Colon('#-2', '#-1')))
            print "z7"
            Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1)))

            # illegal, detected
            # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2')))

            # take advantage of number to string conversion
            print "z8"
            Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
            print "z9"
            Assign(result_key, KeyIndexed(data_key, col=Colon('#1', colCount - 1)))

            # no assign
            print "z10"
            result = KeyIndexed(data_key, row=Colon('#1', rowCount - 10)).do()
            print "z11"
            # result = KeyIndexed(data_key, col=Colon('#1', colCount - 1)).do()

            # do some function translation
            print "z12"
            # result = Fcn('==', 1, KeyIndexed(data_key, col=Colon('#1', colCount - 1))).do()

        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)
def test_parse_multi_header_single(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),
        # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files.
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
        # frames = h2o.nodes[0].frames()['frames']
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")

        # use regex. the only files in the dir will be the ones we just created with *fileN* match
        print "Header Key =", header
        start = time.time()

        # does h2o-dev take a regex? or do we need to glob
        parseResult = h2i.parse_only(pattern='*' + rowxcol + '*', hex_key=hex_key,
            timeoutSecs=timeoutSecs, check_header="1")  # header_from_file=header

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        expectedLabelList = headerData.split(",")
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows,
            expectedNumCols=totalCols, expectedMissinglist=[], expectedLabelList=expectedLabelList)

        if DO_RF:
            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1,
                    'ignored_columns': "['ID','CAPSULE']"}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        h2o.check_sandbox_for_errors()
def test_summary2_exp(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)
    tryList = [
        # co.label, (min, 25th, 50th, 75th, max)
        # parse setup error? supposedly fixed now
        # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
        (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
        (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
        (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
            lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxErr = 1.05 * maxErr
        expected[1] = expectedMin
        expected[5] = expectedMax

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        # column 0 not used here
        assert len(expected) == 6
        co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr)

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        print "maxErr", maxErr
        if co.label != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=False,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.99,
                h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxErr,
            )
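# test_summary2_exp's write_syn_dataset() draws exponential variates with rate
# LAMBD and reports the observed (min, max). A sketch under that assumption,
# via random.expovariate (hypothetical, not the real helper):
def write_syn_dataset_exp_sketch(csvPathname, rowCount, colCount, lambd, SEED):
    r = random.Random(SEED)
    observed = []
    with open(csvPathname, 'w') as dsf:
        for _ in range(rowCount):
            row = [r.expovariate(lambd) for _ in range(colCount)]
            observed.extend(row)
            dsf.write(','.join('%f' % v for v in row) + '\n')
    # the caller uses these to set the expected min/max for summary checks
    return (min(observed), max(observed))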
def test_GLM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (2, 100, 'cA', 300), # (4, 200, 'cA', 300), (10000, 1000, 'cB', 300), (10000, 3000, 'cC', 500), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 labelListUsed = list(labelList) response = 'C' + str(len(labelListUsed)-1) # last column labelListUsed.remove(response) numColsUsed = numCols - 1 for trial in range(1): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? parameters = { 'validation_frame': parse_key, 'ignored_columns': None, # FIX! for now just use a column that's binomial 'response_column': response, # can't take index now? # FIX! when is this needed? redundant for binomial? 'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'tweedie_variance_power': None, 'tweedie_link_power': None, 'alpha': '[1e-4]', 'lambda': '[0.5,0.25, 0.1]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, 'use_all_factor_levels': False, # NPE with n_folds 2? 'n_folds': 1, } model_key = 'many_cols_glm.hex' bmResult = h2o.n0.build_model( algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=120) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
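# A small sketch of the label bookkeeping above: with default H2O labels C1..Cn,
# pick a response column and drop it from the predictor list, mirroring the
# labelListUsed/numColsUsed steps in the test. Hypothetical helper name.
def split_response(labelList, response):
    used = list(labelList)
    used.remove(response)
    return used, len(used)

if __name__ == '__main__':
    labels = ['C%s' % i for i in range(1, 6)]    # C1..C5
    print split_response(labels, 'C5')           # (['C1', 'C2', 'C3', 'C4'], 4)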
def test_parse_full_rand(self): SYNDATASETS_DIR = h2o.make_syn_dir() if DEBUG: n = 20 else: n = 1000000 # from command line arg -long if 1==0: repeat = 1000 scale = 10 # scale up the # of rows tryList = [ (n*scale, 3, 'cI', 300), ] else: repeat = 1 scale = 1 tryList = [ (n, 3, 'cI', 300), ] lastcolsHistory = [] for r in range(repeat): SEED_PER_FILE = random.randint(0, sys.maxint) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # same enum list/mapping, but different dataset? start = time.time() lastcols = write_syn_dataset(csvPathname, rowCount, colCount, scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE) elapsed = time.time() - start print "took %s seconds to create %s" % (elapsed, csvPathname) # why are we saving this? lastcolsHistory.append(lastcols) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, check_header=0, timeoutSecs=60, separator=colSepInt, doSummary=DO_SUMMARY) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) # print "missingValuesList", missingValuesList # for mv in missingValuesList: # self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, # msg='mv %s is not approx. expected %s' % (mv, expectedNA)) # might have extra rows if numRows < rowCount: raise Exception("Expect numRows %s >= rowCount %s since we can have extra eols" % (numRows, rowCount)) # numCols should be right? self.assertEqual(colCount, numCols)
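# Sketch of the separator plumbing above (Python 2): the column separator comes
# in as a hex string, decoded once to the literal character for writing the file
# and once to an int for import_parse's separator argument. Nothing h2o-specific.
colSepHexString = '2c'                       # comma
colSepChar = colSepHexString.decode('hex')   # ','
colSepInt = int(colSepHexString, base=16)    # 44
assert colSepChar == ',' and colSepInt == ord(',')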
def test_summary2_NY0(self): SYNDATASETS_DIR = h2o.make_syn_dir() choicesList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), (' N', ' Y', ' 0'), (' n', ' y', ' 0'), (' F', ' T', ' 0'), (' f', ' t', ' 0'), ] # white space is stripped expectedList = [ ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ('N', 'Y', '0'), ('n', 'y', '0'), ('F', 'T', '0'), ('f', 't', '0'), ] tryList = [ # colname, (min, 25th, 50th, 75th, max) (100, 200, 'x.hex', choicesList[4], expectedList[4]), (100, 200, 'x.hex', choicesList[5], expectedList[5]), (100, 200, 'x.hex', choicesList[6], expectedList[6]), (100, 200, 'x.hex', choicesList[7], expectedList[7]), (100, 200, 'x.hex', choicesList[3], expectedList[3]), (1000, 200, 'x.hex', choicesList[2], expectedList[2]), (10000, 200, 'x.hex', choicesList[1], expectedList[1]), (100000, 200, 'x.hex', choicesList[0], expectedList[0]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, choices, expected) in tryList: # max error = half the bin size? SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(colCount): # walks across the columns triggering a summary on the col desired # runSummary returns a column object now. inspect and parse don't. They return json. # maybe eventually will make them return object? But I also pass expected stuff to them # should I pass expected to summary? no, more complex? co = h2o_cmd.runSummary(key=hex_key, column=i) print co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) print "\nComparing column %s to expected" % i self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \ (i, expectedNaCnt[i], co.missing_count)) self.assertEqual(rowCount - expectedNaCnt[i], sum(co.histogram_bins)) h2p.green_print("\nDone with trial", trial) trial += 1 h2i.delete_keys_at_all_nodes()
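# A hedged model of the whitespace behavior the choicesList/expectedList pairs
# above encode: the parser is expected to strip leading blanks from enum tokens,
# so ' N' and 'N' should land in the same domain entry. Pure Python only.
if __name__ == '__main__':
    choices = (' N', ' Y', ' 0')
    stripped = tuple(c.strip() for c in choices)
    assert stripped == ('N', 'Y', '0')
    print "stripped domain:", stripped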
def test_kmeans_sphere100(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) if SHUFFLE_SPHERES: # since we create spheres in order csvFilename2 = 'syn_spheres100_shuffled.csv' csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2 h2o_util.file_shuffle(csvPathname, csvPathname2) else: csvFilename2 = csvFilename csvPathname2 = csvPathname print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex") pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList numColsUsed = numCols labelListUsed = labelList ### h2b.browseTheCloud() # try 5 times, to see if all inits by h2o are good # does it break if cols is not specified? destination_key = 'syn_spheres100.hex' cols = ",".join(map(str, range(DIMENSIONS))) for trial in range(2): parameters = { 'validation_frame': parse_key, 'ignored_columns': None, 'k': CLUSTERS, 'max_iterations': 50, 'standardize': False, # 'seed': kmeansSeed, 'init': 'Furthest', # [u'Random', u'PlusPlus', u'Furthest', u'User'] # 'dropNA20Cols': False, # 'user_points': userPointsKey } timeoutSecs = 100 model_key = 'sphere100_k.hex' kmeansResult = h2o.n0.build_model(algo='kmeans', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=timeoutSecs) modelResult = h2o.n0.models(key=model_key) km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed) # no expected row/error? expected = [(None, c, None, None) for c in centersList] expected.sort(key=lambda tup: sum(tup[1])) h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta=[.01, .01, .01]) print "Trial #", trial, "completed"
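# Sketch of the center-matching trick above: cluster order isn't guaranteed, so
# both expected and found centers are sorted by the sum of their coordinates
# before pairwise comparison (the expected.sort(key=lambda tup: sum(tup[1]))).
def sort_centers(centers):
    return sorted(centers, key=lambda c: sum(c))

if __name__ == '__main__':
    expectedCenters = [[9, 9, 9], [1, 1, 1], [5, 5, 5]]
    foundCenters = [[5.01, 4.99, 5.0], [0.98, 1.02, 1.0], [9.0, 9.0, 9.01]]
    for e, f in zip(sort_centers(expectedCenters), sort_centers(foundCenters)):
        assert max(abs(a - b) for a, b in zip(e, f)) < 0.1
    print "all centers matched within tolerance"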
def test_parse_time(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_time.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = None colCount = COLS # rowCount = 1000 rowCount = ROWS write_syn_dataset(csvPathname, rowCount, colCount, headerData) for trial in range (20): rowData = rand_rowData() # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) # src_key = csvFilename + "_" + str(trial) hex_key = csvFilename + "_" + str(trial) + ".hex" parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "A trial #", trial # optional. only needed to extract parse_key? pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows print pA.numCols print pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv" h2o.nodes[0].csv_download(key=pA.parse_key, csvPathname=csvDownloadPathname) # do a little testing of saving the key as a csv # remove the original parsed key. source was already removed by h2o if 1==0: h2o.nodes[0].remove_key(pA.parse_key) # interesting. what happens when we do csv download with time data? parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_key) print "B trial #", trial pB = h2o_cmd.ParseObj(parseResultB, expectedNumRows=rowCount, expectedNumCols=colCount) print pB.numRows print pB.numCols print pB.parse_key iB = h2o_cmd.InspectObj(pB.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) # these checks are redundant now self.assertEqual(iA.missingList, iB.missingList, "missingValuesList mismatches after re-parse of downloadCsv result") self.assertEqual(iA.numCols, iB.numCols, "numCols mismatches after re-parse of downloadCsv result") # H2O adds a header to the csv created. It puts quotes around the col numbers if no header # so I guess that's okay. So allow for an extra row here. self.assertEqual(iA.numRows, iB.numRows, "pA.numRows: %s pB.numRows: %s mismatch after re-parse of downloadCsv result" % \ (iA.numRows, iB.numRows) ) print "H2O writes the internal format (number) out for time." # ==> syn_time.csv <== # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30 # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30 # ==> csvDownload.csv <== # "0","1","2","3","4","5" # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12 h2o.check_sandbox_for_errors()
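# A rough, hedged check of the note above that H2O writes times back out as
# numbers: a '%d-%b-%y' date like 25-NOV-10 is ~1.29e12 ms since the epoch
# (UTC assumed here), the same magnitude as the csvDownload.csv values shown.
# The timezone handling is an assumption, not necessarily what h2o does.
import calendar, time

def date_to_ms(s, fmt='%d-%b-%y'):
    return calendar.timegm(time.strptime(s, fmt)) * 1000

if __name__ == '__main__':
    ms = date_to_ms('25-NOV-10')
    print "25-NOV-10 ->", ms
    assert 1.28e12 < ms < 1.30e12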
def test_rapids_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual(numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) Assign('seq1', Seq(range(5)) ) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created Assign('seq2', Fcn('c', Seq(range(5)) )) inspect = h2o_cmd.runInspect(key='seq1') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) Assign('seq3', Col(Seq(range(5))) ) inspect = h2o_cmd.runInspect(key='seq2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) # can't have sequence of sequences? # make sure key is created with c() Assign('seq4', Fcn('c', Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) )) ) inspect = h2o_cmd.runInspect(key='seq1') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5))) ) Assign('seq5', KeyIndexed(data_key, row=Seq(Colon(99, 400), "#2", 1, range(1,5))) ) # they need to be same size # Assign('seq6', Key('seq5') + Key('seq4') + Key('seq3')) # doesn't like my cut? complains on FALSE # Assign(result_key, Cut(KeyIndexed(data_key, col=0))) # Assign(result_key, Cut(KeyIndexed(data_key, col=1), breaks=3)) Assign(result_key, Fcn('min', KeyIndexed(data_key, col=1), True)) Assign(result_key, Fcn('max', KeyIndexed(data_key, col=1), True)) Assign(result_key, Fcn('mean', KeyIndexed(data_key, col=1), 0, False)) Assign(result_key, KeyIndexed(data_key, row='#1')) Assign(result_key, KeyIndexed(data_key, row=Colon('#1', '#100'))) Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100))) # this should fail rapids because of reverse msb/lsb # illegal, detected # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1'))) Assign(result_key, KeyIndexed(data_key, row=Colon('#-2', '#-1'))) Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1))) # illegal, detected # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2'))) # take advantage of number to string conversion Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount-10))) Assign(result_key, KeyIndexed(data_key, col=Colon('#1', colCount-1, ))) # no assign. Expr() complains when result has no key? 
Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount-10))) Assign(result_key, KeyIndexed(data_key, col=Colon('#1', colCount-1,))) # do some function translation Assign(result_key, Fcn('==', 1, KeyIndexed(data_key, col=Colon('#1', colCount-1,))) ) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
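# The commented-out cases above expect rapids to reject a reversed row range
# (Colon('#100', '#1')). A pure-Python guess at that validation -- normalize
# negative indices, then require lo <= hi. Not the real rapids semantics.
def check_colon(lo, hi, nRows):
    lo2 = lo if lo >= 0 else nRows + lo
    hi2 = hi if hi >= 0 else nRows + hi
    if lo2 > hi2:
        raise ValueError("illegal reversed range: %s..%s" % (lo, hi))
    return (lo2, hi2)

if __name__ == '__main__':
    print check_colon(1, 100, 1000)   # fine
    print check_colon(-2, -1, 1000)   # fine: (998, 999)
    try:
        check_colon(100, 1, 1000)     # reversed: should be detected
    except ValueError, e:
        print "rejected as expected:", e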
def setUpClass(cls): h2o.init() global SYNDATASETS_DIR SYNDATASETS_DIR = h2o.make_syn_dir()
def test_exec2_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ (n, 10, 9, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1, iColCount)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like # celChoice = str(random.choice(range(len(cel)))) celChoice = random.choice(range(len(cel))) cutValue[c] = celChoice cutExprList = [] pKey = Key('p') for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # cutExprList.append('p$C'+str(i+1)+'=='+c) # all column indexing in h2o-dev is with number e = Fcn('==', c, pKey[:, i]) cutExprList.append(e) cutExpr = None for ce in cutExprList: if cutExpr: cutExpr = Fcn('&', cutExpr, ce) else: cutExpr = ce print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] # rowExpr = '%s[%s,];' % (hex_key, cutExpr) hKey = Key(hex_key) rowExpr = hKey[cutExpr, :] print "rowExpr:", rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) # print h2o.dump_json(inspect) # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ # h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False) # error if any col has constant values # if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # build up the columns Assign('b', [1, 2, 3]) # could also append 1 col at a time, by assigning to the next col number? Assign('a', Cbind(['b' for i in range(colCount)])) for eKey in eKeys: Assign(eKey, 'a') ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. 
randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] if 1 == 1: start = time.time() Assign(fKey, random.choice(rowExprList)).do() elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # FIX! put quantile back in? quantileTime = 0 # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
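# Sketch of the cutExpr fold in the loop above: the per-column '==' comparisons
# are combined into one conjunction by left-folding with '&'. Shown on strings
# instead of Fcn objects, purely to make the resulting shape visible.
def fold_and(exprs):
    cutExpr = None
    for ce in exprs:
        cutExpr = ce if cutExpr is None else "(& %s %s)" % (cutExpr, ce)
    return cutExpr

if __name__ == '__main__':
    print fold_and(["(== 2 c1)", "(== 0 c4)", "(== 1 c7)"])
    # (& (& (== 2 c1) (== 0 c4)) (== 1 c7))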
def test_w2v_basic_2(self): global SYNDATASETS_DIR SYNDATASETS_DIR = h2o.make_syn_dir() n = 100 tryList = [ # (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: csvPathname = create_file_with_seps(rowCount, colCount) hex_key = "not_used.hex" # just parse to make sure it's good parseResult = h2i.import_parse(path=csvPathname, check_header=1, delete_on_done=0, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList src_key = h2i.find_key('syn_.*csv') # no cols ignored labelListUsed = list(labelList) numColsUsed = numCols for trial in range(1): parameters = { 'validation_frame': parse_key, # KeyIndexed False [] 'ignored_columns': None, # string[] None [] 'minWordFreq': 1, # int 5 [] 'wordModel': 'CBOW', # enum [u'CBOW', u'SkipGram'] 'normModel': 'NegSampling', # enum # [u'HSM', u'NegSampling'] 'negSampleCnt': 1, # int 5 [] 'vecSize': 10, # int 100 'windowSize': 2, # int 5 'sentSampleRate': 0.001, # float 0.001 'initLearningRate': 0.05, # float 0.05 'epochs': 1, # int 5 } model_key = 'benign_w2v.hex' bmResult = h2o.n0.build_model(algo='word2vec', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): if 1==0: execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30) fpResult = execResult['scalar'] else: (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="h")) # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="r1")) print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum)) print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
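# A pure-Python equivalent (struct-based, assumes IEEE-754 doubles) of the
# h2o_util bit-reinterpretation helpers used above, plus the xorsum idea:
# xor the raw 64-bit patterns so summation order and rounding can't matter.
import struct

def double_to_ull(d):
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def ull_to_double(u):
    return struct.unpack('<d', struct.pack('<Q', u))[0]

def xorsum(doubles):
    acc = 0
    for d in doubles:
        acc ^= double_to_ull(d)
    return acc

if __name__ == '__main__':
    assert ull_to_double(double_to_ull(1.5)) == 1.5
    print "xorsum: 0x%0.16x" % xorsum([1.5, -2.25, 3.125])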
def test_summary_stepping(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # co.label, (min, 25th, 50th, 75th, max) # parse setup error # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', .4900, .5000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -.5000, -.4900, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 490, 500, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -500, -490, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 49000, 50000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -50000, -49000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 4900, 5000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -5000, -4900, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]), (5, 1, 'x.hex', 1, 100, ['C1', None, None, None, None, None]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, rangeMin, rangeMax, SEED=SEEDPERFILE) print "expectedMin:", expectedMin, "expectedMax:", expectedMax # add 5% for fp errors? maxErr = ((expectedMax - expectedMin)/1000) * 1.05 expected[1] = expectedMin expected[5] = expectedMax csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows, pA.numCols, pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols # column 0 not used here assert len(expected) == 6 co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr) trial += 1 h2o.nodes[0].remove_all_keys() scipyCol = 0 print "maxErr", maxErr if co.label!='' and expected[scipyCol]: # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=False, col=scipyCol, datatype='float', quantile=0.5 if DO_MEDIAN else 0.99, h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9], # h2oQuantilesApprox=qresult_single, # h2oQuantilesExact=qresult, h2oSummary2MaxErr=maxErr, )
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print(rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # PCA(tolerance iterate)**************************************** for tolerance in [i / 10.0 for i in range(11)]: parameters = { # 'tolerance': tolerance, # 'standardize': 1, 'k': 1, } model_key = 'pca.hex' bmResult = h2o.n0.build_model(algo='pca', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
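# Not h2o's PCA: a tiny power-iteration sketch of what 'k': 1 computes (the
# leading eigenvector of the covariance matrix), to make the parameter above
# concrete. Pure Python, unstandardized data, fixed iteration count.
import random

def pca_k1(rows, iters=100):
    nrow, ncol = len(rows), len(rows[0])
    means = [sum(r[j] for r in rows) / float(nrow) for j in range(ncol)]
    X = [[r[j] - means[j] for j in range(ncol)] for r in rows]
    v = [random.random() + 0.1 for j in range(ncol)]
    for it in range(iters):
        xv = [sum(row[j] * v[j] for j in range(ncol)) for row in X]              # X v
        w = [sum(X[i][j] * xv[i] for i in range(nrow)) for j in range(ncol)]     # X'X v
        norm = sum(x * x for x in w) ** 0.5
        v = [x / norm for x in w]
    return v

if __name__ == '__main__':
    data = [[x, 2.0 * x + random.gauss(0, 0.01)] for x in range(50)]
    print "first PC, ~ +/-(0.45, 0.89):", pca_k1(data)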
def test_rapids_overloaded_opr(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (1000000, 5, 'cA', 200), (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual(numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) # Xbase.debugOnly = True REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) Assign('s1', Seq(range(5)) ) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created # first try as object, then method Assign('s2', Fcn('c', Seq(range(5)) )) # just combine Assign('s3', Col(Seq(range(5)) )) inspect = h2o_cmd.runInspect(key='s3') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert numRows==5 assert numCols==1 Assign('s2', Col(Seq(range(5))) ) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert numRows==5 assert numCols==1 # can't have sequence of sequences? # make sure key is created with c() f = Fcn('c', Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) )) Assign('s1', f) f = Col(Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) )) Assign('s2', f) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert numRows==313 assert numCols==1 print "Now trying to do the functions with the alternate overloaded operators" data_key = Key(parse_key) result_key = Key() # what triggers immediate operation at h2o # as opposed to an object within a function result_key.frame = 'a1' result_key <<= data_key[Seq(range(1,4)), :] result_key.frame = 'a2' result_key <<= data_key[Seq(range(1,4)), :] result_key.frame = 'a3' result_key <<= data_key[Seq(range(1,4)), :] result_key.frame = 'a4' result_key <<= data_key[Seq(range(1,4)), 0:1] result_key.frame = 'a5' result_key <<= data_key[Seq(range(1,4)), 0:1] result_key.frame = 'a6' result_key <<= data_key[[1,2,3], 1] print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
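# Sketch of the overloaded-operator style above: '<<=' can be hooked with
# __ilshift__, which is how 'result_key <<= data_key[...]' can trigger an
# assignment-like side effect. Toy classes, not the real Key/KeyIndexed.
class ToyKey(object):
    store = {}
    def __init__(self, frame=None):
        self.frame = frame
    def __getitem__(self, idx):
        return ('slice', self.frame, idx)
    def __ilshift__(self, expr):
        # the real DSL would ship a rapids AST to h2o here; we just record it
        ToyKey.store[self.frame] = expr
        return self

if __name__ == '__main__':
    data_key = ToyKey('p')
    result_key = ToyKey('a1')
    result_key <<= data_key[1:4, 0:1]
    print ToyKey.store['a1']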
def test_mixed_int_enum_many(self): SYNDATASETS_DIR = h2o.make_syn_dir() # this should be a sorted list for comparing to hbrk in the histogram in h2o summary? enumList = ['abc', 'def', 'ghi'] # numbers 1 and 2 may not be counted as NAs correctly? what about blank space? intList = [0, 1, 2, ''] expectedList = ['abc', 'def', 'ghi'] tryList = [ # not sure about this case # some of the cases interpret as ints now (not as enum) (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False), # colname, (min, COLS5th, 50th, 75th, max) (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True), # fails this case (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True), (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1], True), (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True), # this case seems to fail (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True), # this seems wrong also (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2], True), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) class Column(object): def __init__(self, column): assert isinstance(column, dict) for k, v in column.iteritems(): setattr(self, k, v) # achieves self.k = v x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList: # max error = half the bin size? SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, enumChoices, intChoices) parseResult = h2i.import_parse(path=csvPathname, schema='put', check_header=0, hex_key=hex_key, timeoutSecs=10, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) print "numRows:", numRows, "numCols:", numCols inspect = h2o_cmd.runInspect(None, hex_key) print "\nTrial:", trial, csvFilename # this summary only does one column? # assert colCount == len(columns), "%s %s" % (colCount, len(columns)) for i in range(colCount): summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i + 1)) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) # columns = summaryResult['frames'][0]['columns'] co = Column(summaryResult) # how are enums binned. Stride of 1? (what about domain values) coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros, ] coNameList = [ 'co.base', 'len(co.bins)', 'len(co.data)', 'co.domain', 'co.label', 'co.maxs', 'co.mean', 'co.mins', 'co.missing', 'co.ninfs', 'co.pctiles', 'co.pinfs', 'co.precision', 'co.sigma', 'co.str_data', 'co.stride', 'co.type', 'co.zeros', ] for c, n in zip(coList, coNameList): print n + ":", c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals( co.mean) # what is precision. -1? # This can go to NaN (string) with big numbers # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) # can be None if col is all NA # print "FIX! 
# hacking the co.pctiles because it's short by two" # pctiles = [0] + co.pctiles + [0] assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows) if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(co.type, 'enum', "Expecting co.type %s to be 'enum' for %s co label %s" % (co.type, i, co.label)) if ENABLE_ASSERTS and resultIsEnum: # not always there cardinality = len(co.domain) self.assertEqual(cardinality, len(enumChoices), msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices))) # assume I create the list above in the same order that h2o will show the order. sorted? if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(co.bins, enumChoices) hcntTotal = sum(co.bins) numRowsCreated = rowCount + len(intChoices) if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i]) self.assertEqual(numRows, numRowsCreated, msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated)) nacnt = co.missing if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(nacnt, expectedNaCnt[i], "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt)) # FIX! no checks for the case where it got parsed as int column! trial += 1
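# A hedged model (definitely not h2o's parser) of the enum-vs-int ambiguity this
# test probes: when a column mixes enum strings with ints, the test's expectations
# treat the column as enum and count the non-enum tokens (ints, blanks) as NAs.
def classify_column(values, enumChoices):
    isEnum = any(v in enumChoices for v in values)
    if isEnum:
        missing = sum(1 for v in values if v not in enumChoices)
    else:
        missing = sum(1 for v in values if v == '')
    return ('enum' if isEnum else 'int'), missing

if __name__ == '__main__':
    print classify_column(['abc', 'def', 1, ''], ['abc', 'def', 'ghi'])
    # ('enum', 2): the int and the blank both counted as missing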
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5 * ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5 * ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1 * ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1 * ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1 * ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1 * ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1 * ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100.00]), (1 * ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1 * ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0 # add 5% for fp errors? maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) numRows = pA.numRows numCols = pA.numCols parse_key = pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) #*************************** # Summary co = h2o_cmd.runSummary(key=parse_key) default_pctiles = co.default_pctiles coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros ] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too
h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str, probsList)) parameters = { 'model_id': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model(algo='quantile', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iterations']", model.output['iterations'] print "model.output:['names']", model.output['names'] quantiles = model.output['quantiles'][0] # why is this a double array iterations = model.output['iterations'] assert iterations == 11, iterations print "quantiles: ", quantiles print "iterations: ", iterations # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname != '': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()
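# Pure-Python analogue of the 'median with a sort' cross-check mentioned above
# (in the spirit of h2o_summ.percentileOnSortedlist): linear interpolation
# between the two nearest order statistics. Useful for checking h2o's
# quantiles; not claimed to be the algorithm h2o itself uses.
def percentile_on_sorted(sortedData, quantile):
    n = len(sortedData)
    pos = quantile * (n - 1)
    lo = int(pos)
    hi = min(lo + 1, n - 1)
    frac = pos - lo
    return sortedData[lo] * (1.0 - frac) + sortedData[hi] * frac

if __name__ == '__main__':
    data = sorted([5.0, 1.0, 3.0, 2.0, 4.0])
    print percentile_on_sorted(data, 0.5)    # 3.0
    print percentile_on_sorted(data, 0.99)   # 4.96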
    def test_w2v_basic(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 500000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            csvPathname = create_file_with_seps(rowCount, colCount)

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                check_header=1, delete_on_done=0, timeoutSecs=180, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            cA = h2o_test.OutputObj(iA.columns[0], "inspect_column")

            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            for i in range(colCount):
                print cA.type, cA.missing_count
                self.assertEqual(0, cA.missing_count,
                    "Column %s. Expected 0. missing: %s is incorrect" % (i, cA.missing_count))
                self.assertEqual('string', cA.type,
                    "Column %s. Expected 'string'. type: %s is incorrect" % (i, cA.type))

            if DO_SUMMARY:
                for i in range(colCount):
                    co = h2o_cmd.runSummary(key=parse_key, column=i)
                    print co.label, co.type, co.missing_count, co.domain, sum(co.bins)
                    self.assertEqual(0, co.missing_count,
                        "Column %s. Expected 0. missing: %s is incorrect" % (i, co.missing_count))
                    self.assertEqual('String', co.type,
                        "Column %s. Expected 'String'. type: %s is incorrect" % (i, co.type))

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):
                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'minWordFreq': 5,  # int 5 []
                    'wordModel': 'SkipGram',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'HSM',  # enum [u'HSM', u'NegSampling']
                    'negSampleCnt': 5,  # int 5 []
                    'vecSize': 100,  # int 100
                    'windowSize': 5,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int, default 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(
                    algo='word2vec',
                    model_id=model_key,
                    training_frame=parse_key,
                    parameters=parameters,
                    timeoutSecs=60)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                # not implemented?
                # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
                # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

                h2o_cmd.runStoreView()
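# Hedged sketch: the parameter dicts above encode enum choices as strings, so a
# typo only fails server-side. A local guard could validate them before the
# build request. The allowed values are copied from the inline comments above;
# treat them as assumptions, and the helper name as illustrative.
W2V_ENUMS = {
    'wordModel': ['CBOW', 'SkipGram'],
    'normModel': ['HSM', 'NegSampling'],
}

def check_enum_params(parameters, allowed=W2V_ENUMS):
    # raise locally rather than waiting for an rjson error from the cloud
    for name, choices in allowed.items():
        if name in parameters and parameters[name] not in choices:
            raise ValueError("%s must be one of %s, got %r" % (name, choices, parameters[name]))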
    def test_parse_rand_enum_compress(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 1000000

        # from command line arg -long
        if h2o_args.long_test_case:
            repeat = 1000
            scale = 10  # scale up the # of rows
            tryList = [
                (n*scale, 1, 'cI', 300),
                (n*scale, 1, 'cI', 300),
                (n*scale, 1, 'cI', 300),
            ]
        else:
            repeat = 1
            scale = 1
            tryList = [
                (n, 3, 'cI', 300),
                (n, 3, 'cI', 300),
                (n, 3, 'cI', 300),
            ]

        lastcolsHistory = []
        enumList = create_enum_list(listSize=ENUMS_NUM)

        for r in range(repeat):
            SEED_PER_FILE = random.randint(0, sys.maxint)
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c'  # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
                print "colSepChar:", colSepChar

                rowSepHexString = '0a'  # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
                    scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)
                # why are we saving this?
                lastcolsHistory.append(lastcols)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                    check_header=0, timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY)
                # optional: only needed to extract parse_key?
                parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
                pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount)
                print pA.numRows
                print pA.numCols
                print pA.parse_key
                # this guy can take a json object as the first arg, or re-read with key
                iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount,
                    expectedNumCols=colCount, expectedMissinglist=[])
                self.assertEqual(rowCount, iA.numRows)
                self.assertEqual(colCount, iA.numCols)
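# Hedged sketch of the separator decoding used above: Python 2's str.decode('hex')
# turns a two-digit hex string into the one-byte separator character (Python 3
# would use binascii.unhexlify instead). The extra separators listed here are
# illustrative, not ones the test exercises.
SEP_HEX = {'comma': '2c', 'tab': '09', 'semicolon': '3b', 'newline': '0a'}
for name, hexStr in sorted(SEP_HEX.items()):
    ch = hexStr.decode('hex')  # one-byte separator character
    print name, repr(ch), int(hexStr, base=16)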
    def test_0_NA_2enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 30, '0', 'cC', 100),
            (100, 30, '0.0', 'cC', 100),
            (100, 30, '0.0000000', 'cC', 100),
        ]

        for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount,
                expectedNumCols=colCount, expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            # assert len(expected) == 6
            # FIX! add expected and maxDelta?
            co = h2o_cmd.runSummary(key=hex_key, column=0)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)
            coList = [co.base, len(co.bins), len(co.data), co.domain, co.label,
                co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data, co.stride,
                co.type, co.zeros]
            for k, v in co:
                print k, v

            if DO_REBALANCE:
                print "Rebalancing it to create an artificially large # of chunks"
                rb_key = "rb_%s" % hex_key
                start = time.time()
                print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
                elapsed = time.time() - start
                print "rebalance end on", csvFilename, 'took', elapsed, 'seconds'
            else:
                rb_key = hex_key

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # the column index appears to be 1-based in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index + 1)
                # print "\nto_enum result:", h2o.dump_json(result)
                co = h2o_cmd.runSummary(key=hex_key, column=column_index + 1)
                print co.label, co.type, co.missing, co.domain, sum(co.bins)
                coList = [co.base, len(co.bins), len(co.data), co.domain, co.label,
                    co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                    co.pinfs, co.precision, co.sigma, co.str_data, co.stride,
                    co.type, co.zeros]

                if co.type != 'Enum':
                    raise Exception("column %s, which has name %s, didn't convert to Enum, is %s" %
                        (column_index, co.label, co.type))

                # We generate NAs, so the count should be > 0; with this many rows
                # there should be at least one per column.
                if co.missing <= 0 or co.missing > rowCount:
                    raise Exception("column %s, which has name %s, somehow got the NA count wrong after convert to Enum: %s %s" %
                        (column_index, co.label, co.missing, rowCount))

                # domain is the list of enum levels; NAs don't count, so expect exactly one level
                if len(co.domain) != 1:
                    # print "stats:", h2o.dump_json(stats)
                    print "column:", h2o.dump_json(co)
                    raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" %
                        (column_index, co.label, co.domain))
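# Hedged sanity check for the NA-count assertion above: if each cell is NA with
# probability p (p is an assumption; the real rate lives in write_syn_dataset),
# the chance a rowCount-row column has no NAs at all is (1 - p) ** rowCount, so
# "at least one NA per column" is a safe expectation at 100 rows.
def prob_no_na(p, rows):
    # probability that every one of `rows` independent cells is non-NA
    return (1.0 - p) ** rows

print prob_no_na(0.05, 100)  # ~0.0059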