def test_parse_summary_c21(self):
    # Imports the c21 archive under a train key and a test key, then runs
    # inspect and summary against the key that was just parsed. The second
    # pass used to inspect/summarize 'train.hex' again instead of 'test.hex'.
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    timeoutSecs = 300

    # NOTE(review): both entries name the same archive, exactly as in the
    # original test -- confirm whether a distinct test file was intended.
    datasets = [
        (importFolderPath + '/persona_clean_deep.tsv.zip', 'train.hex'),
        (importFolderPath + '/persona_clean_deep.tsv.zip', 'test.hex'),
    ]

    for csvPathname, key in datasets:
        h2i.import_parse(path=csvPathname, hex_key=key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(key=key)
        # The returned missing-values list was only consumed by a disabled
        # assertion, so it is not captured here.
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
def test_parse_summary_c21(self):
    # Imports the c21 archive under a train key and a test key, then runs
    # inspect and summary on each. The second pass now targets the key it
    # just parsed ('test.hex'); it used to re-check 'train.hex'.
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    timeoutSecs = 300

    # --- train ---
    csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
    hex_key = 'train.hex'
    h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)
    inspect = h2o_cmd.runInspect(key=hex_key)
    h2o_cmd.infoFromInspect(inspect, csvPathname_train)
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    # NOTE(review): here rows/cols go to infoFromSummary, while the test pass
    # below hands them to runSummary -- kept as found, confirm which is right.
    rSummary = h2o_cmd.runSummary(key=hex_key)
    h2o_cmd.infoFromSummary(rSummary, rows=numRows, cols=numCols)

    # --- test ---
    # NOTE(review): same archive path as the train pass, as in the original.
    csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
    validation_key = 'test.hex'
    h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)
    inspect = h2o_cmd.runInspect(key=validation_key)
    h2o_cmd.infoFromInspect(inspect, csvPathname_test)
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
    h2o_cmd.infoFromSummary(rSummary)
def test_exec2_runif(self): print "h2o syntax is not full R. Doesn't take min/max interval params. assumed 0/1 interval" print " just one param, it must be a column or row vector. Result is same length" print " R allows a scalar to be param" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) 'r0.hex = r.hex[,1]', 's0.hex = runif(r.hex[,1],-1)', 's1.hex = runif(r.hex[,1],-1)', 's2.hex = runif(r.hex[,1],-1)', # error. this causes exception # 's3.hex = runif(nrow(r.hex), -1)', ] results = [] for execExpr in execExprList: start = time.time() (resultExec, result) = h2e.exec_expr( execExpr=execExpr, timeoutSecs=30) # unneeded but interesting results.append(result) print "exec end on ", "operators", 'took', time.time( ) - start, 'seconds' print "exec result:", result print "exec result (full):", h2o.dump_json(resultExec) h2o.check_sandbox_for_errors() rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? 
if 1 == 0: print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expresssions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [ 0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0 ])
def test_many_fp_formats(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (100, 100, 'cB', 180), (100000, 10, 'cA', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() print "Will do %s" % NUM_CASES for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) hex_key = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100) print "Parse result['destination_key']:", hex_key inspect = h2o_cmd.runInspect(None, hex_key) print "Removing", hex_key h2o.nodes[0].remove_key(hex_key)
def test_many_fp_formats(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (100, 100, 'cB', 180), (100000, 10, 'cA', 180), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: NUM_CASES = h2o_util.fp_format() print "Will do %s" % NUM_CASES for sel in range(NUM_CASES): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) hex_key = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100) print "Parse result['destination_key']:", hex_key inspect = h2o_cmd.runInspect(None, hex_key) print "Removing", hex_key h2o.nodes[0].remove_key(hex_key)
def test_exec2_runif(self): print "in h2o-dev, params are column, min, max, seed" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) # params for h2o-dev runif are: column, min, max, seed AssignObj('r0.hex', KeyIndexed('r.hex', col=0)), AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)), AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)), AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)), ] results = [] for execExpr in execExprList: start = time.time() result = execExpr.do(timeoutSecs=30) results.append(result) execResult = execExpr.execResult print "exec took", time.time() - start, "seconds" print "exec result:", result print "exec result (full):", h2o.dump_json(execResult) h2o.check_sandbox_for_errors() rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? if 1 == 0: print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expresssions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [ 0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0 ])
def test_create_frame_rand1(self): h2o.beta_features = True # default params = {'rows': 1, 'cols': 1} for trial in range(20): h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 kwargs = params.copy() print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') h2o_cmd.runSummary(key='temp1000.hex', timeoutSecs=300) print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_exec2_runif(self): h2o.beta_features = True print "h2o syntax is not full R. Doesn't take min/max interval params. assumed 0/1 interval" print " just one param, it must be a column or row vector. Result is same length" print " R allows a scalar to be param" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) 'r0.hex = r.hex[,1]', 's0.hex = runif(r.hex[,1],-1)', 's1.hex = runif(r.hex[,1],-1)', 's2.hex = runif(r.hex[,1],-1)', # error. this causes exception # 's3.hex = runif(nrow(r.hex), -1)', ] results = [] for execExpr in execExprList: start = time.time() (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # unneeded but interesting results.append(result) print "exec end on ", "operators" , 'took', time.time() - start, 'seconds' print "exec result:", result print "exec result (full):", h2o.dump_json(resultExec) h2o.check_sandbox_for_errors() rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? 
if 1==0: print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expresssions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
def test_put_parse4(self): timeoutSecs = 10 trial = 1 n = h2o.nodes[0] for x in xrange (2): print 'Trial:', trial csvPathname = 'iris/iris_wheader.csv.gz' hex_key = "iris" + "_" + str(x) + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, doSummary=False, schema='put') h2o_cmd.runSummary(key=hex_key) trial += 1
def test_put_parse4(self): timeoutSecs = 10 trial = 1 n = h2o.nodes[0] for x in xrange (2): print 'Trial:', trial # csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") csvPathname = h2o.find_file('smalldata/iris/iris_wheader.csv.gz') key2 = "iris" + "_" + str(x) + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, doSummary=False) h2o_cmd.runSummary(key=key2, doPrint=True) trial += 1
def test_create_frame_rand1(self): h2o.beta_features = True # default params = { 'rows': 1, 'cols': 1 } for trial in range(20): h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strick checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 kwargs = params.copy() print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') h2o_cmd.runSummary(key='temp1000.hex') print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_rf_airlines_2013_fvec(self): h2o.beta_features = True h2b.browseTheCloud() csvFilename = 'year2013.csv' hex_key = 'year2013.hex' importFolderPath = 'airlines' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False) parse_time = time.time() - start print "parse took {0} sec".format(parse_time) start = time.time() start = time.time() # noise=['JStack','cpu','disk']) h2o_cmd.runSummary(key=hex_key, timeoutSecs=200) elapsed = time.time() - start print "summary took {0} sec".format(elapsed) trees = 10 paramsTrainRF = { 'ntrees': trees, 'max_depth': 20, 'nbins': 200, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed', 'timeoutSecs': 14800, } kwargs = paramsTrainRF.copy() start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n" + l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_exec2_runif(self): print "in h2o-dev, params are column, min, max, seed" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) # params for h2o-dev runif are: column, min, max, seed AssignObj('r0.hex', KeyIndexed('r.hex',col=0) ), AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1) ), AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1) ), AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1) ), ] results = [] for execExpr in execExprList: start = time.time() result = execExpr.do(timeoutSecs=30) results.append(result) execResult = execExpr.execResult print "exec took", time.time() - start, "seconds" print "exec result:", result print "exec result (full):", h2o.dump_json(execResult) h2o.check_sandbox_for_errors() rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? if 1==0: print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expresssions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
def test_put_parse4(self): timeoutSecs = 10 trial = 1 n = h2o.nodes[0] for x in xrange(2): print 'Trial:', trial # csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") csvPathname = h2o.find_file('smalldata/iris/iris_wheader.csv.gz') key2 = "iris" + "_" + str(x) + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, doSummary=False) h2o_cmd.runSummary(key=key2, doPrint=True) trial += 1
def test_put_parse4(self): timeoutSecs = 10 trial = 1 n = h2o.nodes[0] for x in xrange(2): print 'Trial:', trial csvPathname = 'iris/iris_wheader.csv.gz' hex_key = "iris" + "_" + str(x) + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, doSummary=False, schema='put') h2o_cmd.runSummary(key=hex_key) trial += 1
def test_parse_covtype_2(self): tryList = [ ('covtype.data', 1, 30), # ('covtype20x.data', 20, 120), ] for (csvFilename, multiplyExpected, timeoutSecs) in tryList: for trial in range(16,24): # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv")) importFolderPath = "standard" hex_key = 'covtype.hex' csvPathname = importFolderPath + "/" + csvFilename chunk_size = 2**trial print "Trial %s. Trying chunk_size %s (power of 2)" % (trial, chunk_size) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs, hex_key=hex_key, chunk_size=chunk_size, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(1): co = h2o_cmd.runSummary(key=hex_key, column=i) k = parseResult['frames'][0]['frame_id']['name'] # print "parseResult:", dump_json(parseResult) a_node = h2o.nodes[0] frames_result = a_node.frames(key=k, row_count=5) # print "frames_result from the first parseResult key", dump_json(frames_result) parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_parse_nfs(self): print "run as user 0xcustomer on machine with nfs /mnt/0xcustomer-datasets/c1" tryList = [ ('iris2.csv', 'iris2.hex', 1, 30), ] for (csvFilename, hex_key, multiplyExpected, timeoutSecs) in tryList: importFolderPath = "/mnt/0xcustomer-datasets/c1" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=timeoutSecs, hex_key=hex_key, chunk_size=4194304/2, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=150*multiplyExpected, expectedNumCols=5, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(0): print "Summary on column", i co = h2o_cmd.runSummary(key=hex_key, column=i) k = parseResult['frames'][0]['frame_id']['name'] frames_result = h2o.nodes[0].frames(key=k, row_count=5) # print "frames_result from the first parseResult key", dump_json(frames_result) parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_from_import_fvec(self): csvFilenameAll = [ ("covtype.data", 500), # ("covtype20x.data", 1000), ] for (csvFilename, timeoutSecs) in csvFilenameAll: # creates csvFilename.hex from file in importFolder dir hex_key = csvFilename + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True) h2o_cmd.infoFromInspect(inspect, parseResult['destination_key']) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) # h2o_cmd.infoFromSummary(summaryResult) trees = 2 start = time.time() rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n"+l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_c7_rel(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 if DO_GLM: start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
def test_NOPASS_create_frame_fail(self): h2o.beta_features = True for trial in range(20): kwargs = {'integer_range': None, 'missing_fraction': 0.1, 'cols': 10, 'response_factors': 1, 'seed': 1234, 'randomize': 1, 'categorical_fraction': 0, 'rows': 1, 'factors': 0, 'real_range': 0, 'value': None, 'integer_fraction': 0} print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10) h2o_cmd.infoFromSummary(rSummary) print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def getSummaries():
    # Reads ./smalldata.csv, one dataset per line with five comma-separated
    # fields (name, upload path, import path, HDFS path, full path). For each
    # dataset it parses the file, runs a summary, collects per-column names,
    # types and ranges, and passes everything to toJson().
    #
    # Blank lines are skipped (they used to crash the 5-way unpack), and a
    # summary with no columns now reports 0 rows instead of raising.
    with open('./smalldata.csv', 'rb') as f:
        for line in f:
            line = line.strip("\n")
            if not line.strip():
                continue

            IGNORED = 'NA'
            TARGET = 'NA'
            DATANAME, uploadPath, importPath, importHDFS, fullPath = line.split(',')
            PATHS = [uploadPath, importPath, importHDFS]

            bucket = 'smalldata'
            # Drop the first two path components and any surrounding quotes.
            path = '/'.join(importPath.split('/')[2:]).strip('"')
            parseResult = h2i.import_parse(bucket=bucket, path=path, schema='local', doSummary=False)
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            columns = summary['summary']['columns']

            NUMCOLS = len(columns)
            NUMROWS = columns[0]['N'] if columns else 0

            NAMES = ['\"' + col['name'] + '\"' for col in columns]
            TYPES = ['\"' + col['type'] + '\"' for col in columns]
            RANGES = []
            for col in columns:
                if col['type'] == 'number':
                    # col['min'] / col['max'] are iterated with min()/max().
                    tup = '(' + '"' + str(min(col['min'])) + '"' + ',' + '"' + str(max(col['max'])) + '"' + ')'
                else:
                    tup = '("NA", "NA")'
                RANGES.append(tup)

            toJson(DATANAME, PATHS, NAMES, NUMCOLS, NUMROWS, TYPES, RANGES,
                   IGNORED=IGNORED, TARGET=TARGET)
def test_NOPASS_exec2_empty_result(self): bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) start = time.time() for execExpr in exprList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) rSummary = h2o_cmd.runSummary(key="a") h2o_cmd.infoFromSummary(rSummary) h2o.check_sandbox_for_errors() print "exec end on ", "operators", 'took', time.time( ) - start, 'seconds'
def test_parse_covtype_2(self): tryList = [ ('covtype.data', 1, 30), ('covtype20x.data', 20, 120), ] for (csvFilename, multiplyExpected, timeoutSecs) in tryList: # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv")) importFolderPath = "standard" hex_key = 'covtype.hex' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs, hex_key=hex_key, chunk_size=4194304*2, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(1): print "Summary on column", i co = h2o_cmd.runSummary(key=hex_key, column=i) k = parseResult['frames'][0]['key']['name'] # print "parseResult:", dump_json(parseResult) a_node = h2o.nodes[0] frames_result = a_node.frames(key=k, row_count=5) # print "frames_result from the first parseResult key", dump_json(frames_result) parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_parse_nfs(self): print "run as user 0xcustomer on machine with nfs /mnt/0xcustomer-datasets/c1" tryList = [ ('iris2.csv', 'iris2.hex', 1, 30), ] for (csvFilename, hex_key, multiplyExpected, timeoutSecs) in tryList: importFolderPath = "/mnt/0xcustomer-datasets/c1" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=timeoutSecs, hex_key=hex_key, chunk_size=4194304 / 2, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=150 * multiplyExpected, expectedNumCols=5, expectedMissinglist=[]) print iA.missingList, iA.labelList, iA.numRows, iA.numCols for i in range(0): print "Summary on column", i co = h2o_cmd.runSummary(key=hex_key, column=i) k = parseResult['frames'][0]['frame_id']['name'] frames_result = h2o.nodes[0].frames(key=k, row_count=5) # print "frames_result from the first parseResult key", dump_json(frames_result) parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs) print h2o.dump_json(importResult) storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30) # print h2o.dump_json(storeViewResult) hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print 
"\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def do_summary_and_inspect(): # SUMMARY****************************************** summaryResult = h2o_cmd.runSummary(key=hex_key) coltypeList = h2o_cmd.infoFromSummary(summaryResult) # INSPECT****************************************** inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] # Now check both inspect and summary if csvFilename=='covtype.binary.svm': for k in range(55): naCnt = inspect['cols'][k]['naCnt'] self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0)) stype = inspect['cols'][k]['type'] print k, stype self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int')) # summary may report type differently than inspect..check it too! # we could check na here too for i,c in enumerate(coltypeList): print "column index: %s column type: %s" % (i, c) # inspect says 'int?" assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)
def test_summary_with_x_libsvm (self): h2o.beta_features = True print "Empty rows except for the last, with all zeros for class. Single col at max" h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 100, 'cA', 300), (100000, 100, 'cB', 300), (100, 1000, 'cC', 300), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs) numCols = inspect['numCols'] numRows = inspect['numRows'] self.assertEqual(colNumberMax+1, numCols, msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols)) self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows)) for x in range(numCols): print "Doing summary with x=%s" % x summaryResult = h2o_cmd.runSummary(key=hex_key, cols=x, timeoutSecs=timeoutSecs) # skip the infoFromSummary check colName = "C" + str(x+1) print "Doing summary with col name x=%s" % colName summaryResult = h2o_cmd.runSummary(key=hex_key, cols=colName, timeoutSecs=timeoutSecs) # do a final one with all columns for the current check below # FIX! 
we should update the check to check each individual summary result print "Doing and checking summary with no x=%s" % x summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=colNumberMax+1, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
def test_0_NA_2enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 30, '0', 'cC', 100), (100, 30, '0.0', 'cC', 100), (100, 30, '0.0000000', 'cC', 100), ] for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename if DO_REBALANCE: print "Rebalancing it to create an artificially large # of chunks" rb_key = "rb_%s" % hex_key start = time.time() print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds' else: rb_key = hex_key print "Now doing to_enum across all columns of %s" % hex_key for column_index in range(colCount): # is the column index 1-base in to_enum result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1) # print "\nto_enum result:", h2o.dump_json(result) summaryResult = h2o_cmd.runSummary(key=hex_key) # check that it at least is an enum column now, with no na's # just look at the column we touched column = summaryResult['summaries'][column_index] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] cardinality = stats['cardinality'] if stattype != 'Enum': raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" % (column_index, 
colname, stattype, coltype)) # I'm generating NA's ..so it should be > 0. .but it could be zero . I guess i have enough rows to get at least 1 if nacnt<=0 or nacnt>rowCount: raise Exception("column %s, which has name %s, somehow got NA cnt wrong after convert to Enum %s %s" % (column_index, colname, nacnt, rowCount)) if cardinality!=1: # NAs don't count? # print "stats:", h2o.dump_json(stats) print "column:", h2o.dump_json(column) raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" % (column_index, colname, cardinality)) h2o_cmd.infoFromSummary(summaryResult)
def test_libsvm(self): SYNDATASETS_DIR = h2o.make_syn_dir() for trial in range(2): csvFilename = "syn_ints.csv" hex_key = "1.hex" csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, trial) timeoutSecs = 10 # have to import each time, because h2o deletes source after parse # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000) parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000) # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360) print "Inspect:", hex_key, "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o_cmd.infoFromSummary(summaryResult) if DO_KMEANS: # KMEANS****************************************** kwargs = { 'k': 3, 'initialization': 'Furthest', 'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_frame_split_balance(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just split away and see if anything blows up" splitMe = hex_key inspect = h2o_cmd.runInspect(key=splitMe) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] for s in range(20): inspect = h2o_cmd.runInspect(key=splitMe) numRows = inspect['numRows'] numCols = inspect['numCols'] fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5) split0_key = fs['split_keys'][0] split1_key = fs['split_keys'][1] split0_rows = fs['split_rows'][0] split1_rows = fs['split_rows'][1] split0_ratio = fs['split_ratios'][0] split1_ratio = fs['split_ratios'][1] print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split1_key # split should be within 1 row accuracy. let's say within 20 for now self.assertLess(abs(split1_rows - split0_rows), 2) self.assertEqual(numRows, (split1_rows + split0_rows)) self.assertEqual(numCols, origNumCols) if split0_rows <= 2: break print "Now do some rebalancing on the split frames" for trial in range(2): rb_key = "rb_%s_%s" % (trial, splitMe) SEEDPERFILE = random.randint(0, sys.maxint) randChunks = random.randint(1, 100) start = time.time() print "Trial %s: Rebalancing %s to %s with %s chunks" % ( trial, splitMe, rb_key, randChunks) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\ h2o_cmd.runSummary(key=rb_key) print "\nInspecting the original parsed result" inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect=inspect) print "\nInspecting the rebalanced result with %s forced chunks" % randChunks inspect = h2o_cmd.runInspect(key=rb_key) 
h2o_cmd.infoFromInspect(inspect=inspect)
def test_c7_rel(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + csvFilename, " num_rows:", "{:,}".format(num_rows), " num_cols:", "{:,}".format(num_cols) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=num_cols, numRows=num_rows, max_column_display=2500) # it's in runSummary! 
# h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numCols=num_cols, numRows=num_rows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 if DO_GLM: start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_create_frame_rand1(self): h2o.beta_features = True # default params = { 'rows': 1, 'cols': 1 } for trial in range(10): h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', 0) c = params.get('categorical_fraction', 0) r = params.get('randomize', 0) v = params.get('value', None) if r: if v is not None: # if these are None, they are treated as >0 (default > 0?) params['integer_fraction'] = 0 params['categorical_fraction'] = 0 elif (i and c) and (i + c) >= 1.0: params['integer_fraction'] = i params['categorical_fraction'] = 1.0 - i else: params['integer_fraction'] = 0 params['categorical_fraction'] = 0 params['value'] = None kwargs = params.copy() print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') h2o_cmd.runSummary(key='temp1000.hex') print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) if h2o.beta_features: model_key = kmeans["model"]["_selfKey"] # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame # can't use inspect on a model key? now? kmeansResult = kmeans model = kmeansResult["model"] centers = model["clusters"] error = model["error"] else: model_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) model = kmeansResult["KMeansModel"] centers = model["clusters"] error = model["error"] if h2o.beta_features: # need to use Predict2? pass # no scoring on Kmeans2?..just reuse # cols/max_ncols params? predictKey = applyDestinationKey predictResult = h2o.nodes[0].generate_predictions( data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey ) summaryResult = h2o.nodes[0].summary_page(key=predictKey) hcnt = summaryResult["summaries"][0]["hcnt"] # histogram rows_per_cluster = hcnt # have to figure out how to get this with fvec sqr_error_per_cluster = [0 for h in hcnt] else: kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey ) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) # this was failing summaryResult = h2o_cmd.runSummary(key=applyDestinationKey) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key) score = kmeansScoreResult["score"] rows_per_cluster = score["rows_per_cluster"] sqr_error_per_cluster = score["sqr_error_per_cluster"] tupleResultList = [] print "\nerror: ", error for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", centers[i] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str(i) + "]: ", 
sqr_error_per_cluster[i] tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) if h2o.beta_features: # can't use inspect on a model key? now? model = kmeans["model"] model_key = model["_key"] centers = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] kmeansResult = kmeans else: model_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult)) model = kmeansResult["KMeansModel"] centers = model["clusters"] error = model["error"] if h2o.beta_features: # need to use Predict2? pass # no scoring on Kmeans2?..just reuse # cols/max_ncols params? predictKey = applyDestinationKey predictResult = h2o.nodes[0].generate_predictions( data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey ) summaryResult = h2o.nodes[0].summary_page(key=predictKey) hcnt = summaryResult["summaries"][0]["hcnt"] # histogram rows_per_cluster = hcnt # FIX! 
does the cluster order/naming match, compared to cluster variances sqr_error_per_cluster = cluster_variances else: kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey ) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) # this was failing summaryResult = h2o_cmd.runSummary(key=applyDestinationKey) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key) score = kmeansScoreResult["score"] rows_per_cluster = score["rows_per_cluster"] sqr_error_per_cluster = score["sqr_error_per_cluster"] tupleResultList = [] print "\nerror: ", error for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i] tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def test_rf_airlines_2013_fvec(self): h2o.beta_features = True h2b.browseTheCloud() csvFilename = 'year2013.csv' hex_key = 'year2013.hex' importFolderPath = 'airlines' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False) parse_time = time.time() - start print "parse took {0} sec".format(parse_time) start = time.time() start = time.time() # noise=['JStack','cpu','disk']) h2o_cmd.runSummary(key=hex_key, timeoutSecs=200) elapsed = time.time() - start print "summary took {0} sec".format(elapsed) trees = 10 paramsTrainRF = { 'ntrees': trees, 'max_depth': 20, 'nbins': 200, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed', 'timeoutSecs': 14800, } kwargs = paramsTrainRF.copy() start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n"+l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_parse_summary_manyfiles_s3_fvec(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirlist = [("manyfiles-nflx-gz", 800)] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: # change to 50 files csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only( bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs ) print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, ) elapsed = time.time() - start print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360) print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_storeview_import(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = 
h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_summary_airline_s3(self): h2o.beta_features = True csvFilelist = [ ("allyears2k.csv", 300), #4.4MB ("year1987.csv", 600), #130MB ("allyears.csv", 900), #12GB # ("allyears_10.csv", 1800), #119.98GB ] bucket = 'h2o-airlines-unpacked' (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3') s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import s3" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() # this is schema='local'k parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_65k_cols_01_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 63000, 'cH', 100), (10, 65000, 'cH', 100), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Summary should work with 65k" start = time.time() h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=300) print "Summary", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount))
def test_parse_summary_airline_s3(self): csvFilelist = [ ("allyears2k.csv", 300), #4.4MB ("year1987.csv", 600), #130MB ("allyears.csv", 900), #12GB # ("allyears_10.csv", 1800), #119.98GB ] bucket = 'h2o-airlines-unpacked' (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3') s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import s3" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() # this is schema='local'k parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_summary_zip_s3_fvec(self): h2o.beta_features = True csvFilelist = [ ("test_set.zip", 300), # 110.9MB ("train_set.zip", 600), # 362.9MB ] (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3') print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** csvPathname = "allstate/" + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_parse_65k_cols_01_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 63000, 'cH', 100), (10, 65000, 'cH', 100), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Summary should work with 65k" start = time.time() h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=300) print "Summary", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount))
def test_parse_summary_manyfiles_1_fvec(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirlist = [ ("manyfiles-nflx-gz", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_1.dat.gz" (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs) print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numCols, 542) self.assertEqual(numRows, 100000) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** # pass numRows, so we know when na cnt means row is all na's summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, numCols=numCols, numRows=numRows) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_libsvm(self):
    """Write, parse, inspect and summarize a synthetic file twice; optionally run KMeans.

    The trial number is handed to write_syn_dataset, and the same file name
    and hex key are reused on both trials. When DO_KMEANS is set, KMeans is
    run on the parse result and checked with the h2o_kmeans helpers.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    for trial in range(2):
        csvFilename = "syn_ints.csv"
        hex_key = "1.hex"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, trial)
        # NOTE(review): this value is never used; the parse below passes its
        # own timeout and the KMeans section reassigns timeoutSecs.
        timeoutSecs = 10

        # have to import each time, because h2o deletes source after parse

        # PARSE******************************************
        # creates csvFilename.hex from file in importFolder dir
        # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
        parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        summaryResult = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(summaryResult)

        if DO_KMEANS:
            # KMEANS******************************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            # fails if I put this in kwargs..i.e. source = dest
            # 'destination_key': parseResult['destination_key'],

            timeoutSecs = 600
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_frame_split_balance(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just split away and see if anything blows up" splitMe = hex_key inspect = h2o_cmd.runInspect(key=splitMe) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] for s in range(20): inspect = h2o_cmd.runInspect(key=splitMe) numRows = inspect['numRows'] numCols = inspect['numCols'] fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5) split0_key = fs['split_keys'][0] split1_key = fs['split_keys'][1] split0_rows = fs['split_rows'][0] split1_rows = fs['split_rows'][1] split0_ratio = fs['split_ratios'][0] split1_ratio = fs['split_ratios'][1] print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split1_key # split should be within 1 row accuracy. let's say within 20 for now self.assertLess(abs(split1_rows - split0_rows), 2) self.assertEqual(numRows, (split1_rows + split0_rows)) self.assertEqual(numCols, origNumCols) if split0_rows <= 2: break print "Now do some rebalancing on the split frames" for trial in range(2): rb_key = "rb_%s_%s" % (trial, splitMe) SEEDPERFILE = random.randint(0, sys.maxint) randChunks = random.randint(1, 100) start = time.time() print "Trial %s: Rebalancing %s to %s with %s chunks" % (trial, splitMe, rb_key, randChunks) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\ h2o_cmd.runSummary(key=rb_key) print "\nInspecting the original parsed result" inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect=inspect) print "\nInspecting the rebalanced result with %s forced chunks" % randChunks inspect = h2o_cmd.runInspect(key=rb_key) 
h2o_cmd.infoFromInspect(inspect=inspect)
def test_insert_na(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just insert some NAs and see what happens" inspect = h2o_cmd.runInspect(key=hex_key) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] missing_fraction = 0.1 # every iteration, we add 0.1 more from the unmarked to the marked (missing) expectedMissing = missing_fraction * origNumRows # per col for trial in range(2): fs = h2o.nodes[0].insert_missing_values( key=hex_key, missing_fraction=missing_fraction, seed=SEED) print "fs", h2o.dump_json(fs) inspect = h2o_cmd.runInspect(key=hex_key) numRows = inspect['numRows'] numCols = inspect['numCols'] expected = .1 * numRows # Each column should get .10 random NAs per iteration. Within 10%? missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList", missingValuesList for mv in missingValuesList: # h2o_util.assertApproxEqual(mv, expectedMissing, tol=0.01, msg='mv %s is not approx. expected %s' % (mv, expectedMissing)) self.assertAlmostEqual(mv, expectedMissing, delta=0.1 * mv, msg='mv %s is not approx. expected %s' % (mv, expectedMissing)) self.assertEqual(origNumRows, numRows) self.assertEqual(origNumCols, numCols) summaryResult = h2o_cmd.runSummary(key=hex_key) # h2o_cmd.infoFromSummary(summaryResult) print "trial", trial print "expectedMissing:", expectedMissing print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
def test_parse_summary_manyfiles_s3n(self): # these will be used as directory imports/parse csvDirlist = [ ("manyfiles-nflx-gz", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_rebalance_int2enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 30, 'cC', 100), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=20) hex_key=parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=hex_key) print "\n" + csvFilename print "Rebalancing it to create an artificially large # of chunks" rb_key = "rb_%s" % (hex_key) start = time.time() print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\ print "Now doing to_enum across all columns of %s" % hex_key for column_index in range(colCount): # is the column index 1-base in to_enum result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1) # print "\nto_enum result:", h2o.dump_json(result) summaryResult = h2o_cmd.runSummary(key=hex_key) # check that it at least is an enum column now, with no na's # just look at the column we touched column = summaryResult['summaries'][column_index] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] cardinality = stats['cardinality'] if stattype != 'Enum': raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" (column_index, colname, stattype, coltype)) if nacnt!=0: raise Exception("column %s, which has name %s, somehow got NAs after convert to Enum %s" (column_index, colname, nacnt)) if cardinality!=4: raise 
Exception("column %s, which has name %s, should have cardinality 4, got: %s" (column_index, colname, cardinality)) h2o_cmd.infoFromSummary(summaryResult)
def test_parse_summary_manyfiles_s3n(self): # these will be used as directory imports/parse csvDirlist = [ ("manyfiles", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_NOPASS_exec2_empty_result(self): bucket = "smalldata" csvPathname = "iris/iris2.csv" hexKey = "i.hex" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) start = time.time() for execExpr in exprList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) rSummary = h2o_cmd.runSummary(key="a") h2o_cmd.infoFromSummary(rSummary) h2o.check_sandbox_for_errors() print "exec end on ", "operators", "took", time.time() - start, "seconds"
def test_parse_mnist_rebalance(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_training.csv.gz", 600), ("mnist_training.csv.gz", 600), ("mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", 600), ] trial = 0 allDelta = [] for (csvFilename, timeoutSecs) in csvFilelist: hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename, hex_key=hex_key, retryDelaySecs=1, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "\n#******************************************************************************" for trial in range(1): rb_key = "rb_%s_%s" % (trial, hex_key) SEEDPERFILE = random.randint(0, sys.maxint) randChunks = random.randint(1, 100) start = time.time() print "Trial %s: Rebalancing %s to %s with %s chunks" % ( trial, hex_key, rb_key, randChunks) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\ h2o_cmd.runSummary(key=rb_key, timeoutSecs=timeoutSecs) print "\nInspecting the original parsed result" inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect=inspect) print "\nInspecting the rebalanced result with %s forced chunks" % randChunks inspect = h2o_cmd.runInspect(key=rb_key) h2o_cmd.infoFromInspect(inspect=inspect)
def test_NOPASS_create_frame_fail(self): h2o.beta_features = True for trial in range(20): kwargs = { 'integer_range': None, 'missing_fraction': 0.1, 'cols': 10, 'response_factors': 1, 'seed': 1234, 'randomize': 1, 'categorical_fraction': 0, 'rows': 1, 'factors': 0, 'real_range': 0, 'value': None, 'integer_fraction': 0 } print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10) h2o_cmd.infoFromSummary(rSummary) print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_parse_covtype(self):
    """Import and parse covtype and covtype20x, then check the parsed frame.

    For each (file name, row multiplier, timeout) entry the file is imported
    from ~/home-0xdiag-datasets/standard and parsed. InspectObj is given the
    expected shape (581012 * multiplier rows, 55 cols, no missing columns),
    and the frames result for the parsed key is handed to
    parseKeyIndexedCheck.
    """
    tryList = [
        ('covtype.data', 1, 30),
        ('covtype20x.data', 20, 120),
    ]

    for (csvFilename, multiplyExpected, timeoutSecs) in tryList:
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        a_node = h2o.nodes[0]
        importFolderPath = os.path.expanduser("~/home-0xdiag-datasets/standard")
        csvPathname = importFolderPath + "/" + csvFilename
        importResult = a_node.import_files(path=csvPathname)
        # print "importResult:", dump_json(importResult)

        hex_key = importResult['destination_frames'][0]

        if CAUSE_FAIL:
            # NOTE(review): k is only assigned further down, so on the first
            # iteration this raises NameError and on later ones it uses the
            # previous file's key. Possibly hex_key was intended; confirm.
            frames_result = a_node.frames(key=k, row_count=5, timeoutSecs=timeoutSecs)
            # print "frames_result from the first importResult key", dump_json(frames_result)

        # chunk_size is 16 MiB (4194304 * 4 bytes)
        parseResult = a_node.parse(key=hex_key, timeoutSecs=timeoutSecs, chunk_size=4194304 * 4)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=581012 * multiplyExpected, expectedNumCols=55, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        # range(0) is empty, so the per-column summary below never runs.
        for i in range(0):
            print "Summary on column", i
            co = h2o_cmd.runSummary(key=hex_key, column=i)

        k = parseResult['frames'][0]['frame_id']['name']
        # print "parseResult:", dump_json(parseResult)
        frames_result = a_node.frames(key=k, row_count=5)
        # print "frames_result from the first parseResult key", dump_json(frames_result)

        parseKeyIndexedCheck(frames_result, multiplyExpected)
def getSummaries():
    """Parse every dataset listed in ./smalldata.csv and pass its column summary to toJson.

    Each line of the listing must contain exactly five comma-separated
    fields: DATANAME, uploadPath, importPath, importHDFS, fullPath.
    """
    quote = '"'
    with open('./smalldata.csv', 'rb') as listing:
        for entry in listing:
            DATANAME, uploadPath, importPath, importHDFS, fullPath = entry.strip("\n").split(',')
            paths = [uploadPath, importPath, importHDFS]

            # drop the first two path components and any surrounding quotes
            relPath = '/'.join(importPath.split('/')[2:]).strip('"')
            parseResult = h2i.import_parse(
                bucket='smalldata', path=relPath, schema='local', doSummary=False)
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            columns = summary['summary']['columns']

            numCols = len(columns)
            numRows = columns[0]['N']

            names = []
            types = []
            ranges = []
            for col in columns:
                names.append(quote + col['name'] + quote)
                types.append(quote + col['type'] + quote)
                if col['type'] == 'number':
                    lowest = str(min(col['min']))
                    highest = str(max(col['max']))
                    ranges.append('(' + quote + lowest + quote + ',' + quote + highest + quote + ')')
                else:
                    ranges.append('("NA", "NA")')

            toJson(DATANAME, paths, names, numCols, numRows, types, ranges,
                   IGNORED="NA", TARGET="NA")
def test_speedrf_covtype_fvec(self): importFolderPath = "standard" # Parse Train ****************************************************** # csvTrainFilename = 'covtype.data' csvTrainFilename = 'covtype20x.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) xList = [] eList = [] fList = [] trial = 0 for trial in range(10): timeoutSecs = 30 # have unique model names start = time.time() summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print 'summary end', trial, 'on', csvTrainPathname, 'took', elapsed, 'seconds' fList.append(elapsed) eList.append(elapsed) if DO_PLOT: xLabel = 'trial' xList.append(trial) if DO_PLOT: eLabel = 'elapsed' fLabel = 'elapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def do_summary_and_inspect():
    """Run Summary and Inspect on the current parse result and verify covtype.binary.svm.

    Uses hex_key, parseResult, csvFilename and self from the enclosing
    scope. For covtype.binary.svm, the first 55 inspected columns must have
    no NAs and type 'Int', and every summary column type must be 'Numeric'.
    """
    # SUMMARY******************************************
    summaryResult = h2o_cmd.runSummary(key=hex_key)
    coltypeList = h2o_cmd.infoFromSummary(summaryResult)

    # INSPECT******************************************
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
    h2o_cmd.infoFromInspect(inspect, csvFilename)

    numRows = inspect['numRows']
    numCols = inspect['numCols']

    # Now check both inspect and summary
    if csvFilename == 'covtype.binary.svm':
        for k in range(55):
            naCnt = inspect['cols'][k]['naCnt']
            self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
            stype = inspect['cols'][k]['type']
            print k, stype
            self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))

        # summary may report type differently than inspect..check it too!
        # we could check na here too
        # NOTE(review): assumed to sit inside the covtype.binary.svm branch
        # (the message names that file) and after the loop above; the
        # original indentation was lost.
        for i, c in enumerate(coltypeList):
            print "column index: %s column type: %s" % (i, c)
            # inspect says 'int?"
            assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i, c)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    """Check a KMeans result, score the parsed data against it, and print per-cluster stats.

    Args:
        self: the calling test case; passed through to simpleCheckKMeans.
        kmeans: the KMeans response.
        csvPathname: passed to infoFromInspect on the non-beta path.
        parseResult: parse response; its 'destination_key' is the data scored.
        applyDestinationKey: key that receives the predictions / apply result.
        **kwargs: forwarded to simpleCheckKMeans.

    Returns:
        (centers, tupleResultList), where tupleResultList holds one
        (center, rows_per_cluster, sqr_error_per_cluster) tuple per center.
    """
    simpleCheckKMeans(self, kmeans, **kwargs)

    # The two API generations lay the model out differently.
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans['model']
        model_key = model['_key']
        centers = model['centers']
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint('kmeans result:', h2o.dump_json(kmeansResult))
        model = kmeansResult['KMeansModel']
        centers = model['clusters']
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass
        # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult['destination_key'], model_key=model_key, destination_key=predictKey)
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        # the histogram counts of the first summary column are used as
        # rows per cluster
        hcnt = summaryResult['summaries'][0]['hcnt'] # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult['destination_key'], model_key=model_key, destination_key=applyDestinationKey)
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(
            key=parseResult['destination_key'], model_key=model_key)
        score = kmeansScoreResult['score']
        rows_per_cluster = score['rows_per_cluster']
        sqr_error_per_cluster = score['sqr_error_per_cluster']

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        # the comprehension reuses the name c; the body indexes centers[i] instead
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append(
            (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
def test_c5_KMeans_sphere_67MB_fvec(self):
    """Benchmark parse and KMeans (k=15) on the 67MB synthetic sphere dataset over six trials.

    Each trial parses the file (from HDFS or the local path, per FROM_HDFS),
    logs parse throughput, and runs Inspect and Summary. Unless DO_KMEANS is
    false, it then runs KMeans with an initialization chosen by trial % 3
    and compares the result tuples to `expected`.
    """
    h2o.beta_features = True
    # a kludge
    h2o.setup_benchmark_log()

    csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
    # used only to compute MB/sec for the parse
    totalBytes = 67306997
    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    # presumably (center, rows in cluster, squared error) per cluster, to
    # match what bigCheckResults returns -- TODO confirm
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
    ]

    # Only the last assignment takes effect (no benchmark logging).
    benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu','disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs)

        elapsed = time.time() - start
        fileMBS = (totalBytes/1e6)/elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n"+l
        h2o.cloudPerfH2O.message(l)

        # clear out all NAs (walk across cols)..clear to 0
        # temp
        ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
        ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
        h2o_cmd.infoFromSummary(summary)

        # KMeans ****************************************
        # With DO_KMEANS off, the rest of the trial, including the key
        # cleanup at the end, is skipped.
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 10,
            'normalize': 1,
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
            # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
        }

        # cycle the initialization per trial: PlusPlus, Furthest, None
        if (trial%3)==0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial%3)==1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeans)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
        # NOTE(review): assumed to run at the end of every trial; the
        # original indentation was lost.
        h2i.delete_keys_at_all_nodes()