def test_NOPASS_create_frame_fail(self):
    h2o.beta_features = True
    for trial in range(20):
        kwargs = {
            'integer_range': None,
            'missing_fraction': 0.1,
            'cols': 10,
            'response_factors': 1,
            'seed': 1234,
            'randomize': 1,
            'categorical_fraction': 0,
            'rows': 1,
            'factors': 0,
            'real_range': 0,
            'value': None,
            'integer_fraction': 0,
        }
        print kwargs
        timeoutSecs = 300
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex',
            schema='put', timeoutSecs=timeoutSecs)
        cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

        if DO_DOWNLOAD:
            csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
            h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

        if DO_INSPECT:
            h2o_cmd.runInspect(key='temp1000.hex')

        rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
        h2o_cmd.infoFromSummary(rSummary)

        print h2o.dump_json(cfResult)
        print "Trial #", trial, "completed"
def test_parse_summary_c21(self):
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    timeoutSecs = 300

    csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
    hex_key = 'train.hex'
    parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(key=hex_key)
    missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
    # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))

    numCols = inspect['numCols']
    numRows = inspect['numRows']

    rSummary = h2o_cmd.runSummary(key=hex_key)
    h2o_cmd.infoFromSummary(rSummary, rows=numRows, cols=numCols)

    csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
    validation_key = 'test.hex'
    parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(key=hex_key)
    missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
    # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

    numCols = inspect['numCols']
    numRows = inspect['numRows']

    rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
    h2o_cmd.infoFromSummary(rSummary)
def test_storeview_import(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    importFolderPath = "standard"
    csvFilelist = [
        ("covtype.data", 300),
    ]

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        csvPathname = importFolderPath + "/" + csvFilename
        trialStart = time.time()

        # PARSE****************************************
        importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
        print h2o.dump_json(importResult)
        storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
        # print h2o.dump_json(storeViewResult)

        hex_key = csvFilename + "_" + str(trial) + ".hex"
        print "parse start on:", csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key=hex_key, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        # STOREVIEW***************************************
        print "Trying StoreView to all nodes after the parse"
        for n, node in enumerate(h2o.nodes):
            print "\n*****************"
            print "StoreView node %s:%s" % (node.http_addr, node.port)
            storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
            f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
            result = h2o.dump_json(storeViewResult)
            f.write(result)  # write the dump so the per-node file isn't left empty
            f.close()
        lastStoreViewResult = storeViewResult

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_c7_rel(self):
    h2o.beta_features = False
    print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here"
    print "Want to be able to run python as jenkins"
    print "I guess for big 0xcust files, we don't need schema='put'"
    print "For files that we want to put (for testing put), we can get non-private files"

    csvFilename = 'part-00000b'
    importFolderPath = '/mnt/0xcustomer-datasets/c2'
    csvPathname = importFolderPath + "/" + csvFilename

    # FIX! does 'separator=' take ints or ?? hex format
    # looks like it takes the hex string (two chars)
    start = time.time()
    # hardwire TAB as a separator, as opposed to white space (9)
    parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True)
    print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
    print "Parse result['destination_key']:", parseResult['destination_key']

    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    # num_rows = inspect['num_rows']
    # num_cols = inspect['num_cols']

    keepPattern = "oly_|mt_|b_"
    y = "is_purchase"
    print "y:", y
    # don't need the intermediate Dicts produced from columnInfoFromInspect
    x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
    print "x:", x

    kwargs = {
        'x': x,
        'y': y,
        # 'case_mode': '>',
        # 'case': 0,
        'family': 'binomial',
        'lambda': 1.0E-5,
        'alpha': 0.5,
        'max_iter': 4,
        'n_folds': 1,
        'beta_epsilon': 1.0E-4,
    }

    timeoutSecs = 3600

    if DO_GLM:
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

    # do summary of the parsed dataset last, since we know it fails on this dataset
    summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
    h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
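# Hedged note on the GLM kwargs above (standard elastic-net reading; the exact
# h2o semantics may differ): 'alpha' mixes the L1/L2 penalties and 'lambda'
# scales the whole penalty, roughly
#     penalty = lambda * (alpha * ||beta||_1 + (1 - alpha)/2 * ||beta||_2^2)
# so alpha=0.5 with lambda=1e-5 is a lightly regularized elastic net.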
def test_NOPASS_exec2_empty_result(self):
    bucket = 'smalldata'
    csvPathname = 'iris/iris2.csv'
    hexKey = 'i.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

    start = time.time()
    for execExpr in exprList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        rSummary = h2o_cmd.runSummary(key="a")
        h2o_cmd.infoFromSummary(rSummary)

    h2o.check_sandbox_for_errors()
    print "exec end on ", "operators", 'took', time.time() - start, 'seconds'
def test_0_NA_2enum(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 30, '0', 'cC', 100),
        (100, 30, '0.0', 'cC', 100),
        (100, 30, '0.0000000', 'cC', 100),
    ]

    for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        if DO_REBALANCE:
            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % hex_key
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'
        else:
            rb_key = hex_key

        print "Now doing to_enum across all columns of %s" % hex_key
        for column_index in range(colCount):
            # is the column index 1-based in to_enum?
            result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
            # print "\nto_enum result:", h2o.dump_json(result)

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            # check that it at least is an enum column now, with no NAs
            # just look at the column we touched
            column = summaryResult['summaries'][column_index]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            cardinality = stats['cardinality']

            if stattype != 'Enum':
                raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" %
                    (column_index, colname, stattype, coltype))

            # I'm generating NAs, so the count should be > 0; it could be zero,
            # but there should be enough rows to get at least 1.
            if nacnt <= 0 or nacnt > rowCount:
                raise Exception("column %s, which has name %s, somehow got NA cnt wrong after convert to Enum %s %s" %
                    (column_index, colname, nacnt, rowCount))

            # NAs don't count?
            if cardinality != 1:
                # print "stats:", h2o.dump_json(stats)
                print "column:", h2o.dump_json(column)
                raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" %
                    (column_index, colname, cardinality))

            h2o_cmd.infoFromSummary(summaryResult)
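# Hedged helper sketch (hypothetical; not part of h2o_cmd): the enum-column
# checks above reappear in test_rebalance_int2enum below and could be factored
# out. This assumes only the summaries[i] layout already used in these tests
# (colname/type/nacnt/stats, with stats['type'] and stats['cardinality']).
def check_enum_summary_column(summaryResult, column_index, expectedCardinality):
    column = summaryResult['summaries'][column_index]
    colname = column['colname']
    stats = column['stats']
    if stats['type'] != 'Enum':
        raise Exception("column %s (%s) didn't convert to Enum, is %s %s" %
            (column_index, colname, stats['type'], column['type']))
    if stats['cardinality'] != expectedCardinality:
        raise Exception("column %s (%s) should have cardinality %s, got: %s" %
            (column_index, colname, expectedCardinality, stats['cardinality']))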
def test_libsvm(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    for trial in range(2):
        csvFilename = "syn_ints.csv"
        hex_key = "1.hex"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, trial)
        timeoutSecs = 10

        # have to import each time, because h2o deletes source after parse

        # PARSE******************************************
        # creates csvFilename.hex from file in importFolder dir
        # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
        parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(summaryResult)

        if DO_KMEANS:
            # KMEANS******************************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': None,  # range(11, numCols), # THIS BREAKS THE REST API
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310,
            }
            # fails if I put this in kwargs..i.e. source = dest
            # 'destination_key': parseResult['destination_key'],
            timeoutSecs = 600
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)

    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans["model"]
        model_key = model["_key"]
        centers = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult))
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass  # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey)
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances
    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey)
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
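# Hedged helper sketch (hypothetical, pure Python): aggregate the
# (center, rows_per_cluster, sqr_error_per_cluster) tuples that
# bigCheckResults returns, e.g. so a caller can assert no cluster came
# back empty without re-walking the raw kmeans result.
def total_rows_and_error(tupleResultList):
    totalRows = sum(rows for (center, rows, sqrError) in tupleResultList)
    totalError = sum(sqrError for (center, rows, sqrError) in tupleResultList)
    return (totalRows, totalError)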
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)

    if h2o.beta_features:
        model_key = kmeans["model"]["_selfKey"]
        # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame
        # can't use inspect on a model key? now?
        kmeansResult = kmeans
        model = kmeansResult["model"]
        centers = model["clusters"]
        error = model["error"]
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass  # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey)
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # have to figure out how to get this with fvec
        sqr_error_per_cluster = [0 for h in hcnt]
    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey)
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", centers[i]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
def test_parse_summary_manyfiles_s3_fvec(self):
    h2o.beta_features = True
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles-nflx-gz", 800),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        # change to 50 files
        csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
        (importHDFSResult, importPattern) = h2i.import_only(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs)

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
        print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_storeview_import(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    importFolderPath = "standard"
    csvFilelist = [
        ("covtype.data", 300),
    ]

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        csvPathname = importFolderPath + "/" + csvFilename
        trialStart = time.time()

        # PARSE****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        print "parse start on:", csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            hex_key=hex_key, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        # STOREVIEW***************************************
        print "Trying StoreView to all nodes after the parse"
        for n, node in enumerate(h2o.nodes):
            print "\n*****************"
            print "StoreView node %s:%s" % (node.http_addr, node.port)
            storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
            f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
            result = h2o.dump_json(storeViewResult)
            f.write(result)  # write the dump so the per-node file isn't left empty
            f.close()
        lastStoreViewResult = storeViewResult

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_summary_airline_s3(self):
    h2o.beta_features = True
    csvFilelist = [
        ("allyears2k.csv", 300),  # 4.4MB
        ("year1987.csv", 600),  # 130MB
        ("allyears.csv", 900),  # 12GB
        # ("allyears_10.csv", 1800), # 119.98GB
    ]

    bucket = 'h2o-airlines-unpacked'
    (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
    s3nFullList = importHDFSResult['succeeded']
    self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

    print "\nTrying StoreView after the import s3"
    h2o_cmd.runStoreView(timeoutSecs=120)

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()
        csvPathname = csvFilename

        # PARSE****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # this is schema='local'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_summary_zip_s3_fvec(self):
    h2o.beta_features = True
    csvFilelist = [
        ("test_set.zip", 300),  # 110.9MB
        ("train_set.zip", 600),  # 362.9MB
    ]

    (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3')

    print "\nTrying StoreView after the import hdfs"
    h2o_cmd.runStoreView(timeoutSecs=120)

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE****************************************
        csvPathname = "allstate/" + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_summary_manyfiles_s3n(self):
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles", 600),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname,
            schema='s3n', timeoutSecs=timeoutSecs)
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_rebalance_int2enum(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100000, 30, 'cC', 100),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=20)
        hex_key = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "\n" + csvFilename

        print "Rebalancing it to create an artificially large # of chunks"
        rb_key = "rb_%s" % (hex_key)
        start = time.time()
        print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
        rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
        elapsed = time.time() - start
        print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'

        print "Now doing to_enum across all columns of %s" % hex_key
        for column_index in range(colCount):
            # is the column index 1-based in to_enum?
            result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
            # print "\nto_enum result:", h2o.dump_json(result)

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            # check that it at least is an enum column now, with no NAs
            # just look at the column we touched
            column = summaryResult['summaries'][column_index]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            cardinality = stats['cardinality']

            if stattype != 'Enum':
                raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" %
                    (column_index, colname, stattype, coltype))

            if nacnt != 0:
                raise Exception("column %s, which has name %s, somehow got NAs after convert to Enum %s" %
                    (column_index, colname, nacnt))

            if cardinality != 4:
                raise Exception("column %s, which has name %s, should have cardinality 4, got: %s" %
                    (column_index, colname, cardinality))

            h2o_cmd.infoFromSummary(summaryResult)
def test_parse_summary_manyfiles_s3n(self):
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles-nflx-gz", 600),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='s3n', timeoutSecs=timeoutSecs)
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n',
            hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def do_summary_and_inspect():
    # SUMMARY******************************************
    summaryResult = h2o_cmd.runSummary(key=hex_key)
    coltypeList = h2o_cmd.infoFromSummary(summaryResult)

    # INSPECT******************************************
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
    h2o_cmd.infoFromInspect(inspect, csvFilename)

    numRows = inspect['numRows']
    numCols = inspect['numCols']

    # Now check both inspect and summary
    if csvFilename == 'covtype.binary.svm':
        for k in range(55):
            naCnt = inspect['cols'][k]['naCnt']
            self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
            stype = inspect['cols'][k]['type']
            print k, stype
            self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))

        # summary may report type differently than inspect..check it too!
        # we could check na here too
        for i, c in enumerate(coltypeList):
            print "column index: %s column type: %s" % (i, c)
            # inspect says 'Int'?
            assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i, c)
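# Hedged note: do_summary_and_inspect is a nested closure; it reads hex_key,
# parseResult, csvFilename, and self from the enclosing test's scope rather
# than taking them as parameters. A sketch of the assumed enclosing pattern:
#
#     def test_something(self):
#         parseResult = h2i.import_parse(...)
#         hex_key = parseResult['destination_key']
#         def do_summary_and_inspect():
#             ...
#         do_summary_and_inspect()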
def test_summary_with_x_libsvm(self):
    h2o.beta_features = True
    print "Empty rows except for the last, with all zeros for class. Single col at max"
    h2b.browseTheCloud()

    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 100, 'cA', 300),
        (100000, 100, 'cB', 300),
        (100, 1000, 'cC', 300),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        # dict of col sums for comparison to exec col sums below
        (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
            max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
        numCols = inspect['numCols']
        numRows = inspect['numRows']

        self.assertEqual(colNumberMax+1, numCols,
            msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols))
        self.assertEqual(rowCount, numRows,
            msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

        for x in range(numCols):
            print "Doing summary with x=%s" % x
            summaryResult = h2o_cmd.runSummary(key=hex_key, cols=x, timeoutSecs=timeoutSecs)
            # skip the infoFromSummary check

            colName = "C" + str(x+1)
            print "Doing summary with col name x=%s" % colName
            summaryResult = h2o_cmd.runSummary(key=hex_key, cols=colName, timeoutSecs=timeoutSecs)

        # do a final one with all columns for the current check below
        # FIX! we should update the check to check each individual summary result
        print "Doing and checking summary with no x=%s" % x
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=colNumberMax+1, timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
def test_from_import_fvec(self):
    h2o.beta_features = True
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]

    for csvFilename in csvFilenameAll:
        # creates csvFilename.hex from file in importFolder dir
        hex_key = csvFilename + '.hex'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename,
            schema='local', hex_key=hex_key, timeoutSecs=500, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
        h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult)

        trees = 2
        start = time.time()
        rfView = h2o_cmd.runRF(trees=trees, max_depth=20, parseResult=parseResult, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. ' \
            'trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
            trees, classification_error, classErrorPctList, totalScores)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_from_import(self):
    h2o.beta_features = True
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for csvFilename in csvFilenameAll:
        # creates csvFilename.hex from file in importFolder dir
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path="standard/" + csvFilename,
            schema="local", hex_key=hex_key, timeoutSecs=500, doSummary=False)
        print "Parse result['destination_key']:", parseResult["destination_key"]

        inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], verbose=True)
        h2o_cmd.infoFromInspect(inspect, parseResult["destination_key"])

        summaryResult = h2o_cmd.runSummary(key=parseResult["destination_key"])
        h2o_cmd.infoFromSummary(summaryResult)

        if not h2o.beta_features:
            RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)
            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_many_fp_formats_libsvm(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use numCols? numCols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols))

                syn = {}
                if k == 0:
                    syn['name'] = "C1"
                    syn['type'] = {'Int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn['name'] = "C2"
                    syn['type'] = {'Int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    # syn['scale'] = {1}
                else:
                    syn['name'] = "C" + str(k + 1)
                    syn['type'] = {'Int', 'Real'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn['naCnt'] = 0
                syn['cardinality'] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols['min'], cols['max']
                            print "syn min/max:", syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
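# Hedged illustration (pure Python; not part of the test): why the col-sum
# comparison above only checks to places=0. Floating-point addition is not
# associative, so summing the same values in a different order can give a
# slightly different result.
def fp_sum_order_demo():
    a = [1e16, 1.0, -1e16]  # the 1.0 is absorbed into 1e16, then cancelled away
    b = [1e16, -1e16, 1.0]  # same values, different order
    return (sum(a), sum(b))  # (0.0, 1.0) with IEEE-754 doubles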
def test_summary(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (500000, 1, 'cD', 300, 0, 9),   # expectedMin/Max must cause 10 values
        (500000, 2, 'cE', 300, 1, 10),  # expectedMin/Max must cause 10 values
        (500000, 2, 'cF', 300, 2, 11),  # expectedMin/Max must cause 10 values
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    x = 0
    for (rowCount, colCount, key2, timeoutSecs, expectedMin, expectedMax) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname

        legalValues = {}
        for x in range(expectedMin, expectedMax):
            legalValues[x] = x

        write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10, doSummary=False)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        summaryResult = h2o_cmd.runSummary(key=key2)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # remove bin_names because it's too big (256?) and bins
        # just touch all the stuff returned
        summary = summaryResult['summary']
        columnsList = summary['columns']
        for columns in columnsList:
            N = columns['N']
            self.assertEqual(N, rowCount)

            name = columns['name']
            stype = columns['type']
            self.assertEqual(stype, 'number')

            histogram = columns['histogram']
            bin_size = histogram['bin_size']
            self.assertEqual(bin_size, 1)

            bin_names = histogram['bin_names']
            bins = histogram['bins']
            nbins = histogram['bins']

            for b in bins:
                e = .1 * rowCount
                self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            # not done if enum
            if stype != "enum":
                smax = columns['max']
                smin = columns['min']
                percentiles = columns['percentiles']
                thresholds = percentiles['thresholds']
                values = percentiles['values']
                mean = columns['mean']
                sigma = columns['sigma']

                self.assertEqual(smax[0], expectedMax)
                self.assertEqual(smax[1], expectedMax-1)
                self.assertEqual(smax[2], expectedMax-2)
                self.assertEqual(smax[3], expectedMax-3)
                self.assertEqual(smax[4], expectedMax-4)

                self.assertEqual(smin[0], expectedMin)
                self.assertEqual(smin[1], expectedMin+1)
                self.assertEqual(smin[2], expectedMin+2)
                self.assertEqual(smin[3], expectedMin+3)
                self.assertEqual(smin[4], expectedMin+4)

                # apparently our 'percentile estimate' uses interpolation, so this check is not met by h2o
                for v in values:
                    ## self.assertIn(v, legalValues, "Value in percentile 'values' is not present in the dataset")
                    # but: you would think it should be within the min-max range?
                    self.assertTrue(v >= expectedMin,
                        "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax,
                        "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))

                self.assertAlmostEqual(mean, (expectedMax+expectedMin)/2.0, delta=0.1)
                # FIX! how do we estimate this
                self.assertAlmostEqual(sigma, 2.9, delta=0.1)

                # since we distribute the outputs evenly from 0 to 9, we can check
                # that the value is equal to the threshold (within some delta)
                # is this right?
                # if thresholds = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                # values =       [   0,    0,   1,    2,    3,   5,    7,    7,   9,    9,   10]
                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin == 1:
                    eV = eV1
                elif expectedMin == 0:
                    eV = [e-1 for e in eV1]
                elif expectedMin == 2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected values for expectedMin: %s" % expectedMin)

                for t, v, e in zip(thresholds, values, eV):
                    m = "Percentile threshold: %s with value %s should ~= %s" % (t, v, e)
                    self.assertAlmostEqual(v, e, delta=0.5, msg=m)

        trial += 1
        if (1 == 0):
            generate_scipy_comparison(csvPathname)
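# Hedged sketch (pure Python; not h2o code): why the exact membership check
# above is commented out. A percentile computed by linear interpolation
# between the two closest ranks usually lands between the legal integer
# values rather than exactly on one of them.
def interpolated_percentile(sortedVals, threshold):
    # linear interpolation between the two closest ranks
    pos = threshold * (len(sortedVals) - 1)
    loIdx = int(pos)
    hiIdx = min(loIdx + 1, len(sortedVals) - 1)
    frac = pos - loIdx
    return sortedVals[loIdx] * (1 - frac) + sortedVals[hiIdx] * frac

# e.g. interpolated_percentile(range(10), 0.25) == 2.25, which is in the
# min-max range but is not one of the dataset's integer values.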
def test_parse_bounds_libsvm (self): print "Random 0/1 for col1. Last has max col = 1, All have zeros for class." ## h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 100, 'cA', 300), (100000, 100, 'cB', 300), (100, 100000, 'cC', 300), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] row_size = inspect['row_size'] value_size_bytes = inspect['value_size_bytes'] print "\n" + csvPathname, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols), \ " value_size_bytes:", "{:,}".format(value_size_bytes), \ " row_size:", "{:,}".format(row_size) expectedRowSize = num_cols * 1 # plus output expectedValueSize = expectedRowSize * num_rows self.assertEqual(row_size, expectedRowSize, msg='row_size %s is not expected num_cols * 1 byte: %s' % \ (row_size, expectedRowSize)) self.assertEqual(value_size_bytes, expectedValueSize, msg='value_size_bytes %s is not expected row_size * rows: %s' % \ (value_size_bytes, expectedValueSize)) # summary respects column limits col_limit = int(floor( 0.3 * colNumberMax )) # trigger an fvec conversion h2o.beta_features = True print "Do a summary2, which triggers a VA to fvec" summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=col_limit, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) h2o.beta_features = False print "Go back to VA" # self.assertEqual(col_limit, len( summaryResult[ 'summary'][ 'columns' ] ), # "summary doesn't respect column limit of %d on %d cols" % (col_limit, colNumberMax+1)) summaryResult = h2o_cmd.runSummary(key=hex_key, max_column_display=10*num_cols, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols)) self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows)) summary = summaryResult['summary'] columnsList = summary['columns'] self.assertEqual(colNumberMax+1, len(columnsList), msg="generated %s cols (including output). 
summary has %s columns" % (colNumberMax+1, len(columnsList))) for columns in columnsList: N = columns['N'] # self.assertEqual(N, rowCount) name = columns['name'] stype = columns['type'] histogram = columns['histogram'] bin_size = histogram['bin_size'] bin_names = histogram['bin_names'] bins = histogram['bins'] nbins = histogram['bins'] # definitely not enums zeros = columns['zeros'] na = columns['na'] smax = columns['max'] smin = columns['min'] mean = columns['mean'] sigma = columns['sigma'] # a single 1 in the last col if name == "V" + str(colNumberMax): # h2o puts a "V" prefix synZeros = num_rows - 1 synSigma = None # not sure..depends on the # rows somehow (0 count vs 1 count) synMean = 1.0/num_rows # why does this need to be a 1 entry list synMin = [0.0, 1.0] synMax = [1.0, 0.0] elif name == ("V1"): # can reverse-engineer the # of zeroes, since data is always 1 synSum = synColSumDict[1] # could get the same sum for all ccols synZeros = num_rows - synSum synSigma = 0.50 synMean = (synSum + 0.0)/num_rows synMin = [0.0, 1.0] synMax = [1.0, 0.0] else: synZeros = num_rows synSigma = 0.0 synMean = 0.0 synMin = [0.0] synMax = [0.0] # print zeros, synZeros self.assertAlmostEqual(float(mean), synMean, places=6, msg='col %s mean %s is not equal to generated mean %s' % (name, mean, 0)) # why are min/max one-entry lists in summary result. Oh..it puts N min, N max self.assertTrue(smin >= synMin, msg='col %s min %s is not >= generated min %s' % (name, smin, synMin)) self.assertTrue(smax <= synMax, msg='col %s max %s is not <= generated max %s' % (name, smax, synMax)) # reverse engineered the number of zeroes, knowing data was always 1 if present? if name == "V65536" or name == "V65537": print "columns around possible zeros mismatch:", h2o.dump_json(columns) self.assertEqual(zeros, synZeros, msg='col %s zeros %s is not equal to generated zeros count %s' % (name, zeros, synZeros)) self.assertEqual(stype, 'number', msg='col %s type %s is not equal to %s' % (name, stype, 'number')) # our random generation will have some variance for col 1. so just check to 2 places if synSigma: self.assertAlmostEqual(float(sigma), synSigma, delta=0.03, msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma)) self.assertEqual(0, na, msg='col %s num_missing_values %d should be 0' % (name, na))
def test_four_billion_rows(self): timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ ("four_billion_rows.csv", "a.hex"), ("four_billion_rows.csv", "b.hex"), ] for (csvFilename, hex_key) in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print "Parse result['destination_key']:", parseResult['destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] byteSize = inspect['byteSize'] print "\n" + csvFilename, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(2, numCols, msg="generated %s cols (including output). parsed to %s cols" % (2, numCols)) self.assertEqual(4*1000000000, numRows, msg="generated %s rows, parsed to %s rows" % (4*1000000000, numRows)) # KMeans********************************* kwargs = { 'k': 3, 'initialization': 'Furthest', 'max_iter': 20, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # Exec to make binomial######################## execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, 1+1, hex_key, 1+1, 0) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # GLM********************************* print "\n" + csvFilename colX = 0 kwargs = { 'response': 'C2', 'n_folds': 0, 'cols': colX, 'alpha': 0, 'lambda': 0, 'family': 'binomial', # 'link' can be family_default, identity, logit, log, inverse, tweedie } # one coefficient is checked a little more # L2 timeoutSecs = 900 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "/home/0xdiag/datasets/standard" csvFilelist = [ ("covtype.data", 300), ] # IMPORT********************************************** # H2O deletes the source key. So re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) # the list could be from hdfs/s3 (ec2 remap) or local. They have two different list structures if 'succeeded' in importFolderResult: succeededList = importFolderResult['succeeded'] elif 'files' in importFolderResult: succeededList = importFolderResult['files'] else: raise Exception("Can't find 'files' or 'succeeded' in import list") ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 3, "Should see more than 3 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** key2 = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds', "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the first one goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseKey['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") json.dump(storeViewResult, f, indent=4, sort_keys=True, default=str) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_four_billion_rows(self): timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ ("four_billion_rows.csv", "a.hex"), ("four_billion_rows.csv", "b.hex"), ] for (csvFilename, hex_key) in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] value_size_bytes = inspect['value_size_bytes'] row_size = inspect['row_size'] print "\n" + csvFilename, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols), \ " value_size_bytes:", "{:,}".format(value_size_bytes), \ " row_size:", "{:,}".format(row_size) expectedRowSize = num_cols * 1 # plus output expectedValueSize = expectedRowSize * num_rows self.assertEqual(row_size, expectedRowSize, msg='row_size %s is not expected num_cols * 1 byte: %s' % \ (row_size, expectedRowSize)) self.assertEqual(value_size_bytes, expectedValueSize, msg='value_size_bytes %s is not expected row_size * rows: %s' % \ (value_size_bytes, expectedValueSize)) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(2, num_cols, msg="generated %s cols (including output). parsed to %s cols" % (2, num_cols)) self.assertEqual(4*1000000000, num_rows, msg="generated %s rows, parsed to %s rows" % (4*1000000000, num_rows)) # KMeans********************************* kwargs = { 'k': 3, 'initialization': 'Furthest', 'epsilon': 1e-6, 'max_iter': 20, 'cols': None, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # GLM********************************* print "\n" + csvFilename kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1} # one coefficient is checked a little more colX = 0 # L2 timeoutSecs = 900 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
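# The row/value size assertions above are plain arithmetic over the inspect fields;
# as a sketch (hypothetical helper, assuming 1 byte per cell as for these small
# integer columns):
def expected_sizes(num_rows, num_cols, bytes_per_cell=1):
    row_size = num_cols * bytes_per_cell       # per-row bytes (plus output)
    value_size_bytes = row_size * num_rows
    return (row_size, value_size_bytes)
# e.g. expected_sizes(4*1000000000, 2) -> (2, 8000000000)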
def test_c5_KMeans_sphere_26GB_fvec(self): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' csvFilename = 'syn_sphere15_gen_26GB.csv' # csvFilename = 'syn_sphere_gen_h1m.csv' # csvFilename = 'syn_sphere_gen_real_1.49M.csv' # csvFilename = 'syn_sphere_gen_h1m_no_na.csv' totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? if NA_COL_BUG: expected = [ # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB, # so shouldn't be used for 26GB # or it should be divided by 7 # the distribution is the same, obviously. ([ -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215), ([ 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379), ([ 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] else: expected = [ ([ 0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 0.0, 
20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215), ([ 0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379), ([ 0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. # PARSE **************************************** print "Parse starting: " + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse( path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs) else: parseResult = h2i.import_parse( path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n" + l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300) numRows = inspect['numRows'] numCols = inspect['numCols'] summary = h2o_cmd.runSummary(key=parseResult['destination_key'], numRows=numRows, numCols=numCols, timeoutSecs=300) h2o_cmd.infoFromSummary(summary) # KMeans **************************************** if not DO_KMEANS: continue print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?" kwargs = { 'k': 15, 'max_iter': 500, # 'normalize': 1, 'normalize': 0, # temp try 'initialization': 'Furthest', 'destination_key': 'junk.hex', # we get NaNs if whole col is NA 'ignored_cols': 'C1', 'normalize': 0, # reuse the same seed, to get deterministic results 'seed': 265211114317615310, } if (trial % 3) == 0: kwargs['initialization'] = 'PlusPlus' elif (trial % 3) == 1: kwargs['initialization'] = 'Furthest' else: kwargs['initialization'] = None timeoutSecs = 4 * 3600 params = kwargs paramsString = json.dumps(params) start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. 
of timeout" % ( (elapsed / timeoutSecs) * 100) print "kmeans result:", h2o.dump_json(kmeansResult) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial), csvFilename, elapsed, paramsString) print l h2o.cloudPerfH2O.message(l) # his does predict (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeansResult, csvPathname, parseResult, 'd', **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # these clusters were sorted compared to the cluster order in training h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial) # why is the expected # of rows not right in KMeans2. That means predictions are wrong h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial) # the tupleResultList has the size during predict? compare it to the sizes during training # I assume they're in the same order. model = kmeansResult['model'] size = model['size'] size2 = [t[1] for t in tupleResultList] if 1 == 1: # debug print "training size:", size print "predict size2:", size2 print "training sorted(size):", sorted(size) print "predict sorted(size2):", sorted(size2) print h2o.nodes[0].http_addr print h2o.nodes[0].port clusters = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] iterations = model["iterations"] normalized = model["normalized"] max_iter = model["max_iter"] print "iterations", iterations if iterations >= ( max_iter - 1): # h2o hits the limit at max_iter-1..shouldn't hit it raise Exception( "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s", (trial, iterations, max_iter)) # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure # can't do this compare, because size2 is sorted by center order.. # so we don't know how to reorder size the same way # we could just sort the two of them, for some bit of comparison. if sorted(size) != sorted(size2): raise Exception( "trial: %s training cluster sizes: %s not the same as predict on same data: %s" % (trial, size, size2)) # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram expectedSize = [t[1] / SCALE_SIZE for t in expected] if size2 != expectedSize: raise Exception( "trial: %s training cluster sizes: %s not the same as expected: %s" % (trial, size, expectedSize)) if DELETE_KEYS_EACH_ITER: h2i.delete_keys_at_all_nodes()
def test_four_billion_rows_fvec(self): h2o.beta_features = True timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ "four_billion_rows.csv", ] for csvFilename in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs, pollTimeoutSecs=180) elapsed = time.time() - start print "Parse result['destination_key']:", parseResult['destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] byteSize = inspect['byteSize'] print "\n" + csvFilename, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols), \ " byteSize:", "{:,}".format(byteSize) expectedRowSize = numCols * 1 # plus output # expectedValueSize = expectedRowSize * numRows expectedValueSize = 8001271520 self.assertEqual(byteSize, expectedValueSize, msg='byteSize %s is not expected: %s' % \ (byteSize, expectedValueSize)) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(2, numCols, msg="generated %s cols (including output). parsed to %s cols" % (2, numCols)) self.assertEqual(4*1000000000, numRows, msg="generated %s rows, parsed to %s rows" % (4*1000000000, numRows)) # KMeans********************************* kwargs = { 'k': 3, 'initialization': 'Furthest', 'max_iter': 4, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # GLM********************************* print "\n" + csvFilename kwargs = { 'response': 'C1', 'n_folds': 0, 'family': 'binomial', } # one coefficient is checked a little more colX = 1 # convert to binomial execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % ('C1', 'C1', 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} # L2 timeoutSecs = 900 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
def test_parse_bounds_libsvm (self): print "Empty rows except for the last, with all zeros for class. Single col at max" ## h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 100, 'cA', 300), (100000, 100, 'cB', 300), (100, 10000, 'cC', 300), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols)) self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows)) # just want to see if we stack trace on these for x in range(num_cols): print "Doing summary with x=%s" % x summaryResult = h2o_cmd.runSummary(key=key2, x=x, timeoutSecs=timeoutSecs) # skip the infoFromSummary check if x==0: colName = "Target" else: colName = "V" + str(x) print "Doing summary with col name x=%s" % colName summaryResult = h2o_cmd.runSummary(key=key2, x=colName, timeoutSecs=timeoutSecs) # skip the infoFromSummary check # do a final one with all columns for the current check below # FIX! we should update the check to check each individual summary result print "Doing and checking summary with no x=%s" % x summaryResult = h2o_cmd.runSummary(key=key2, max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) summary = summaryResult['summary'] columnsList = summary['columns'] self.assertEqual(colNumberMax+1, len(columnsList), msg="generated %s cols (including output). summary has %s columns" % (colNumberMax+1, len(columnsList))) for columns in columnsList: N = columns['N'] # self.assertEqual(N, rowCount) name = columns['name'] stype = columns['type'] histogram = columns['histogram'] bin_size = histogram['bin_size'] bin_names = histogram['bin_names'] bins = histogram['bins'] nbins = histogram['nbins'] # definitely not enums zeros = columns['zeros'] na = columns['na'] smax = columns['max'] smin = columns['min'] mean = columns['mean'] sigma = columns['sigma'] # a single 1 in the last col # print name if name == ("V" + str(colNumberMax)): # h2o puts a "V" prefix synMean = 1.0/num_rows # why does this need to be a 1 entry list synMin = [0.0, 1.0] synMax = [1.0, 0.0] else: synMean = 0.0 synMin = [0.0] synMax = [0.0] self.assertEqual(float(mean), synMean, msg='col %s mean %s is not equal to generated mean %s' % (name, mean, synMean)) # why are min/max one-entry lists in summary result. Oh..it puts N min, N max self.assertEqual(smin, synMin, msg='col %s min %s is not equal to generated min %s' % (name, smin, synMin)) self.assertEqual(smax, synMax, msg='col %s max %s is not equal to generated max %s' % (name, smax, synMax)) self.assertEqual(0, na, msg='col %s num_missing_values %d should be 0' % (name, na))
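# The summary's 'min'/'max' fields here are short lists of the smallest/largest
# distinct values seen, which is why synMin/synMax above are lists. A sketch that
# is consistent with the expected values in these tests (helper name hypothetical):
def n_min_max(values, n=2):
    distinct = sorted(set(values))
    return (distinct[:n], list(reversed(distinct))[:n])
# e.g. a column of zeros plus a single 1 gives ([0.0, 1.0], [1.0, 0.0]),
# matching the synMin/synMax pairs asserted above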
def test_many_fp_formats_libsvm (self): # h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 10, 'cA', 30, 'sparse50'), (100, 10, 'cB', 30, 'sparse'), (100000, 100, 'cC', 30, 'sparse'), (1000, 10, 'cD', 30, 'sparse50'), (100, 100, 'cE', 30, 'sparse'), (100, 100, 'cF', 30, 'sparse50'), ] for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList: # for sel in range(48): # len(caseList) for sel in [random.randint(0,47)]: # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # dict of col sums for comparison to exec col sums below (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution) selKey2 = hex_key + "_" + str(sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] print "\n" + csvFilename # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the first one goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) if DO_SUMMARY: summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # we might have added some zeros at the end, that our colNumberMax won't include print synColSumDict.keys(), colNumberMax self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols)) # Exec (column sums)************************************************* h2e.exec_zero_list(zeroList) # how do we know the max dimension (synthetic may not generate anything for the last col) # use num_cols?. num_cols should be <= colCount.
colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1, timeoutSecs=timeoutSecs) self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows)) # need to fix this for compare to expected # we should be able to keep the list of fp sums per col above # when we generate the dataset print "\ncolSumList:", colSumList print "\nsynColSumDict:", synColSumDict for k,v in synColSumDict.iteritems(): if k > colNumberMax: # ignore any extra 0 cols at the end continue # k should be integers that match the number of cols self.assertTrue(k>=0 and k<len(colSumList), msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols)) syn = {} if k==0: syn['name'] = "Target" syn['size'] = {1,2} # can be two if we actually used the full range 0-255 (need extra for h2o NA) syn['type'] = {'int'} syn['min'] = classMin syn['max'] = classMax # don't check these for the col 0 'Target' syn['scale'] = {1} # syn['base'] = 0 # syn['variance'] = 0 elif k==1: # we forced this to always be 0 syn['name'] = "V" + str(k) syn['size'] = {1} syn['type'] = {'int'} syn['min'] = 0 syn['max'] = 0 syn['scale'] = {1} syn['base'] = 0 syn['variance'] = 0 else: syn['name'] = "V" + str(k) syn['size'] = {1,2,4,8} # can be 2, 4 or 8? maybe make this a set for membership check syn['type'] = {'int', 'float'} syn['min'] = valMin syn['max'] = valMax syn['scale'] = {1,10,100,1000} # syn['base'] = 0 # syn['variance'] = 0 syn['num_missing_values'] = 0 syn['enum_domain_size'] = 0 # syn['min'] = 0 # syn['max'] = 0 # syn['mean'] = 0 cols = inspect['cols'][k] for synKey in syn: # we may not see the min/max range of values that was bounded by our gen, but # we can check that it's a subset of the allowed range if synKey == 'min': self.assertTrue(syn[synKey] <= cols[synKey], msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey])) elif synKey == 'max': self.assertTrue(syn[synKey] >= cols[synKey], msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey])) elif synKey == 'size' or synKey == 'scale' or synKey == 'type': if cols[synKey] not in syn[synKey]: # for debug of why it was a bad size print "cols size/min/max:", cols['size'], cols['min'], cols['max'] print "syn size/min/max:", syn['size'], syn['min'], syn['max'] raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey])) else: self.assertEqual(syn[synKey], cols[synKey], msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey])) colSum = colSumList[k] print "\nComparing col", k, "sums:", v, colSum # Even though we're comparing floating point sums, the operations probably should have # been done in same order, so maybe the comparison can be exact (or not!) self.assertAlmostEqual(float(v), colSum, places=0, msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
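# Note on the final assertAlmostEqual(places=0): the column sums are floating point
# and h2o may accumulate in a different order than the generator, so the test only
# requires agreement after rounding. The equivalent predicate, for reference:
def sums_approx_equal(expected, actual, places=0):
    # same semantics as unittest's assertAlmostEqual: round(a - b, places) == 0
    return round(abs(expected - actual), places) == 0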
def test_c5_KMeans_sphere_67MB_fvec(self): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() csvFilename = 'syn_sphere_gen_h1m_no_na.csv' totalBytes = 67306997 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) , ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) , ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) , ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) , ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) , ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) , ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) , ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) , ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) , ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) , ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) , ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) , ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) , ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) , ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) , ] benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. 
# PARSE **************************************** print "Parse starting: " + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) else: parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n"+l h2o.cloudPerfH2O.message(l) # clear out all NAs (walk across cols)..clear to 0 # temp ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key) ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500) h2o_cmd.infoFromInspect(inspect, csvPathname) summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500) h2o_cmd.infoFromSummary(summary) # KMeans **************************************** if not DO_KMEANS: continue print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?" kwargs = { 'k': 15, 'max_iter': 10, 'normalize': 1, 'initialization': 'Furthest', 'destination_key': 'junk.hex', # reuse the same seed, to get deterministic results 'seed': 265211114317615310, # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work } if (trial%3)==0: kwargs['initialization'] = 'PlusPlus' elif (trial%3)==1: kwargs['initialization'] = 'Furthest' else: kwargs['initialization'] = None timeoutSecs = 4 * 3600 params = kwargs paramsString = json.dumps(params) start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print "kmeans result:", h2o.dump_json(kmeans) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString) print l h2o.cloudPerfH2O.message(l) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial) h2i.delete_keys_at_all_nodes()
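# The trial % 3 branching above just rotates the KMeans initialization mode across
# trials; the same selection as a one-liner (hypothetical helper; None presumably
# falls back to h2o's default initialization):
def init_for_trial(trial):
    return ['PlusPlus', 'Furthest', None][trial % 3]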
def test_create_rebalance_2enum(self): # default params = {'rows': 100, 'cols': 1} for trial in range(20): # CREATE FRAME params################################################################ h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 # CREATE FRAME***************************************************** kwargs = params.copy() print kwargs timeoutSecs = 300 hex_key = 'temp1000.hex' cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs) # REBALANCE***************************************************** print "Rebalancing it to create an artificially large # of chunks" rb_key = "rb_%s" % (hex_key) start = time.time() print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS) SEEDPERFILE = random.randint(0, sys.maxint) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS) elapsed = time.time() - start print "rebalance end on ", hex_key, 'to', rb_key, 'took', elapsed, 'seconds' # TO ENUM***************************************************** print "Now doing to_enum across all columns of %s" % rb_key for column_index in range(params['cols']): # is the column index 1-based in to_enum result = h2o.nodes[0].to_enum(None, src_key=rb_key, column_index=column_index + 1) # print "\nto_enum result:", h2o.dump_json(result) summaryResult = h2o_cmd.runSummary(key=rb_key) # summarize the rebalanced/enum'd frame, not the original # check that it at least is an enum column now, with no na's # just look at the column we touched column = summaryResult['summaries'][column_index] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # we have some # of na's in the columns...but there should not be 100% NA if nacnt >= params['rows']: raise Exception( "column %s, which has name '%s', somehow too many NAs after convert to Enum %s %s" % (column_index, colname, nacnt, params['rows'])) print "I suspect that columns that are constant, maybe with NAs also, don't convert to Enum" if stattype != 'Enum': raise Exception( "column %s, which has name '%s', didn't convert to Enum, is %s %s %s" % (column_index, colname, stattype, coltype, h2o.dump_json(column))) cardinality = stats['cardinality'] # don't know the cardinality expected # if cardinality!=4: # raise Exception("column %s, which has name '%s', should have cardinality 4, got: %s" % # (column_index, colname, cardinality)) h2o_cmd.infoFromSummary(summaryResult) print "Trial #", trial, "completed"
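# The parameter fix-up at the top of the trial loop enforces h2o's create_frame
# constraints (the fractions are only meaningful with randomize, and
# integer_fraction + categorical_fraction must stay under 1.0). The same
# normalization as a standalone helper (hypothetical name), for clarity:
def normalize_create_frame_params(params):
    i = params.get('integer_fraction') or 0
    c = params.get('categorical_fraction') or 0
    if params.get('randomize'):
        if i and c and (i + c) >= 1.0:
            c = 1.0 - i                  # shrink categorical so the fractions fit
        params['integer_fraction'] = i
        params['categorical_fraction'] = c
        params['value'] = None           # 'value' only applies when not randomizing
    else:
        params['randomize'] = 0
        params['integer_fraction'] = 0
        params['categorical_fraction'] = 0
    return params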
def test_four_billion_rows_fvec(self): h2o.beta_features = True timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ "four_billion_rows.csv", ] for csvFilename in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=3) elapsed = time.time() - start print "Parse result['destination_key']:", parseResult[ 'destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] byteSize = inspect['byteSize'] print "\n" + csvFilename, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols), \ " byteSize:", "{:,}".format(byteSize) expectedRowSize = numCols * 1 # plus output # expectedValueSize = expectedRowSize * numRows expectedValueSize = 8001271520 self.assertEqual(byteSize, expectedValueSize, msg='byteSize %s is not expected: %s' % \ (byteSize, expectedValueSize)) summaryResult = h2o_cmd.runSummary( key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual( 2, numCols, msg="generated %s cols (including output). parsed to %s cols" % (2, numCols)) self.assertEqual(4 * 1000000000, numRows, msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, numRows)) # KMeans********************************* kwargs = { 'k': 3, 'initialization': 'Furthest', 'max_iter': 10, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=4, **kwargs) # GLM********************************* print "\n" + csvFilename kwargs = { 'response': 'C1', 'n_folds': 0, 'family': 'binomial', } # one coefficient is checked a little more colX = 1 # convert to binomial execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('1', '1', 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} # L2 timeoutSecs = 900 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
def test_exec_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 3, 2, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] for j in range(CUT_EXPR_CNT): print "Creating", CUT_EXPR_CNT, 'cut expressions' # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column # cel = colEnumList[c] cel = colEnumList # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' && '.join(cutExprList) print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] rowExpr = '%s[%s,];' % (hex_key, cutExpr) print "rowExpr:", rowExpr rowExprList.append(rowExpr) print "j:", j # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # print h2o.dump_json(inspect) rSummary = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(rSummary) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. 
randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==0: start = time.time() e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1)) elapsed = time.time() - start print "exec 1 took", elapsed, "seconds." execTime = elapsed if 1==1: start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." if 1==0: gKey = random.choice(eKeys) # do a 2nd random to see if things blow up start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey)) elapsed = time.time() - start print "exec 3 took", elapsed, "seconds." if 1==1: inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
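# A condensed sketch of the cut-expression construction used above, to make the
# generated exec syntax explicit (hypothetical helper; in the test hex_key is 'p'):
def build_cut_expr(hex_key, iColCount, colEnumList, r=random):
    cols = r.sample(range(iColCount), r.randint(1, iColCount))
    terms = []
    for c in cols:
        # enum levels are referenced by their numeric mapping in the cut
        level = str(r.choice(range(len(colEnumList))))
        terms.append('%s$C%s==%s' % (hex_key, c + 1, level))
    return '%s[%s,];' % (hex_key, ' && '.join(terms))
# e.g. build_cut_expr('p', 3, colEnumList) might return "p[p$C2==1 && p$C3==0,];"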
def test_impute_with_na(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just insert some NAs and see what happens" inspect = h2o_cmd.runInspect(key=hex_key) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] missing_fraction = 0.1 # NOT ALLOWED TO SET AN ENUM COL? if 1 == 0: # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec? # just one in row 1 for enumCol in enumColList: print "hack: Putting NA in row 1 of col %s" % enumCol execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after exec:", missingValuesList if len(missingValuesList) != len(enumColList): raise Exception( "Didn't get missing values in expected number of cols: %s %s" % (enumColList, missingValuesList)) for trial in range(5): # copy the dataset hex_key2 = 'c.hex' execExpr = '%s = %s' % (hex_key2, hex_key) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) imvResult = h2o.nodes[0].insert_missing_values( key=hex_key2, missing_fraction=missing_fraction, seed=SEED) print "imvResult", h2o.dump_json(imvResult) # maybe make the output col a factor column # maybe one of the 0,1 cols too? # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns. # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3) print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before" expectedMissing = missing_fraction * origNumRows # per col enumColList = [49, 50, 51, 52, 53, 54] for e in enumColList: enumResult = h2o.nodes[0].to_enum(src_key=hex_key2, column_index=(e + 1)) inspect = h2o_cmd.runInspect(key=hex_key2) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(origNumRows, numRows) self.assertEqual(origNumCols, numCols) missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList", missingValuesList if len(missingValuesList) != numCols: raise Exception( "Why is missingValuesList not right after ToEnum2?: %s %s" % (enumColList, missingValuesList)) for mv in missingValuesList: self.assertAlmostEqual(mv, expectedMissing, delta=0.1 * mv, msg='mv %s is not approx. expected %s' % (mv, expectedMissing)) summaryResult = h2o_cmd.runSummary(key=hex_key2) h2o_cmd.infoFromSummary(summaryResult) print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect" print "trial", trial print "expectedMissing:", expectedMissing print "Now get rid of all the missing values, by imputing means. We know all columns should have NAs from above" print "Do the columns in random order" # don't do the enum cols ..impute doesn't support them, right?
if AVOID_BUG: shuffledColList = range(0, 49) # 0 to 48 execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) # summaryResult = h2o_cmd.runSummary(key=hex_key2) # h2o_cmd.infoFromSummary(summaryResult) inspect = h2o_cmd.runInspect(key=hex_key2) numCols = inspect['numCols'] missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after pruning enum cols:", missingValuesList if len(missingValuesList) != 49: raise Exception( "expected missing values in all cols after pruning enum cols: %s" % missingValuesList) else: shuffledColList = range(0, 55) # 0 to 54 origInspect = inspect random.shuffle(shuffledColList) for column in shuffledColList: # get a random set of column. no duplicate. random order? 0 is okay? will be [] groupBy = random.sample(range(55), random.randint(0, 54)) # header names start with 1, not 0. Empty string if [] groupByNames = ",".join( map(lambda x: "C" + str(x + 1), groupBy)) # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap columnName = "C%s" % (column + 1) print "don't use mode if col isn't enum" badChoices = True while badChoices: method = random.choice(["mean", "median", "mode"]) badChoices = column not in enumColList and method == "mode" NEWSEED = random.randint(0, sys.maxint) print "does impute modify the source key?" # we get h2o error (argument exception) if no NAs impResult = h2o.nodes[0].impute(source=hex_key2, column=column, method=method) print "Now check that there are no missing values" print "FIX! broken..insert missing values doesn't insert NAs in enum cols" inspect = h2o_cmd.runInspect(key=hex_key2) numRows2 = inspect['numRows'] numCols2 = inspect['numCols'] self.assertEqual( numRows, numRows2, "impute shouldn't have changed frame numRows: %s %s" % (numRows, numRows2)) self.assertEqual( numCols, numCols2, "impute shouldn't have changed frame numCols: %s %s" % (numCols, numCols2)) # check that the mean didn't change for the col # the enum cols with mode, we'll have to think of something else missingValuesList = h2o_cmd.infoFromInspect(inspect) print "missingValuesList after impute:", missingValuesList if missingValuesList: raise Exception( "Not expecting any missing values after imputing all cols: %s" % missingValuesList) cols = inspect['cols'] origCols = origInspect['cols'] for i, (c, oc) in enumerate(zip(cols, origCols)): # I suppose since we impute to either median or mean, we can't assume the mean stays the same # but for this tolerance it's okay (maybe with a different dataset, that wouldn't be true) h2o_util.approxEqual( c['mean'], oc['mean'], tol=0.000000001, msg= "col %i original mean: %s not equal to mean after impute: %s" % (i, oc['mean'], c['mean']))
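# For clarity, what the impute call is expected to do per column, sketched in plain
# Python (this is an interpretation, not h2o's implementation; mode only makes
# sense for factor/enum columns, as the badChoices loop above enforces):
def impute_column(values, method):
    present = [v for v in values if v is not None]          # drop the NAs
    if method == 'mean':
        fill = sum(present) / float(len(present))
    elif method == 'median':
        s = sorted(present)
        mid = len(s) // 2
        fill = s[mid] if len(s) % 2 else (s[mid - 1] + s[mid]) / 2.0
    else:                                                   # 'mode'
        fill = max(set(present), key=present.count)
    return [fill if v is None else v for v in values]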
def test_exec2_na2mean(self): h2o.beta_features = True print "https://0xdata.atlassian.net/browse/PUB-228" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) 'rcnt = c(0)', 'total = c(0)', 'mean = c(0)', 's.hex = r.hex', "x=r.hex[,1]; rcnt=nrow(x)-sum(is.na(x))", "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x))", "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt", "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt; x=ifelse(is.na(x),mean,x)", ] execExprList2 = [ "s.hex = apply(r.hex,2," + "function(x){total=sum(ifelse(is.na(x),0,x)); " + "rcnt=nrow(x)-sum(is.na(x)); " + "mean=total / rcnt; " + "ifelse(is.na(x),mean,x)} " + ")" , # this got an exception. note I forgot to assign to x here "s=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)", # throw in a na flush to 0 "x=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)", ] execExprList += execExprList2 results = [] for execExpr in execExprList: start = time.time() (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # unneeded but interesting results.append(result) print "exec end on ", "operators" , 'took', time.time() - start, 'seconds' print "exec result:", result print "exec result (full):", h2o.dump_json(resultExec) h2o.check_sandbox_for_errors() # compare it to summary rSummary = h2o_cmd.runSummary(key='r.hex', cols='0') h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s.hex', cols='0') h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expressions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0, 1859.0, 1859.0])
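# The exec expressions above implement na-to-mean one column at a time; the same
# logic in plain Python, for reference (None stands in for NA):
def na2mean(x):
    rcnt = len(x) - sum(1 for v in x if v is None)     # nrow(x) - sum(is.na(x))
    total = sum(0 if v is None else v for v in x)      # sum(ifelse(is.na(x),0,x))
    mean = total / float(rcnt)                         # total / rcnt
    return [mean if v is None else v for v in x]       # ifelse(is.na(x),mean,x)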
def test_c10_glm_fvec(self):
    print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here"
    print "Want to be able to run python as jenkins"
    print "I guess for big 0xcust files, we don't need schema='put'"
    print "For files that we want to put (for testing put), we can get non-private files"

    # Parse Train***********************************************************
    importFolderPath = '/mnt/0xcustomer-datasets/c3'
    csvFilename = 'classification1Train.txt'
    csvPathname = importFolderPath + "/" + csvFilename

    start = time.time()
    # hack. force it to NA the header, so we have col names that are not customer-sensitive below
    parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False, header=0)
    print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
    print "Parse result['destination_key']:", parseResult['destination_key']

    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    numRows = inspect['numRows']
    numCols = inspect['numCols']

    # do summary of the parsed dataset last, since we know it fails on this dataset
    summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
    h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

    # keepList = []
    # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
    # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices

    y = 0
    ignore_x = []
    x = [6, 7, 8, 10, 12, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 43, 44, 45, 46, 47,
         49, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]
    for i in range(numCols):
        if i not in x and i != y:
            ignore_x.append(i)

    # since we're no longer zero-based, increment by 1
    ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

    # GLM Train***********************************************************
    keepPattern = None
    print "y:", y
    # don't need the intermediate Dicts produced from columnInfoFromInspect
    x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
    print "x:", x
    print "ignore_x:", ignore_x

    kwargs = {
        'response': y,
        'ignored_cols': ignore_x,
        'family': 'binomial',
        'lambda': 1.0E-5,
        'alpha': 0.5,
        'max_iter': 10,
        'n_folds': 1,
        'beta_epsilon': 1.0E-4,
    }

    timeoutSecs = 3600
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "glm completed in", elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

    # Parse Test***********************************************************
    GLMModel = glm['glm_model']
    modelKey = GLMModel['_key']

    csvFilename = 'classification1Test.txt'
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
    print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
def test_parse_time(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    colCount = 6
    rowCount = 10
    headerData = rand_header(colCount)
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)

    for trial in range(1):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        numRowsA = inspect['numRows']
        numColsA = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
            numCols=numColsA, numRows=numRowsA, noPrint=True)
        print summaryResult
        h2o_cmd.infoFromSummary(summaryResult)

        (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
            h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
        if constantValuesDictA or enumSizeDictA:
            raise Exception("Should be empty? constantValuesDictA %s enumSizeDictA %s" %
                (constantValuesDictA, enumSizeDictA))

        print "missingValuesDictA", missingValuesDictA
        # self.assertEqual(missingValuesDictA, {}, "missingValuesDict should be empty")
        self.assertEqual(numColsA, colCount)
        self.assertEqual(numRowsA, rowCount)

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        # interesting. what happens when we do csv download with time data?
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        numRowsB = inspect['numRows']
        numColsB = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
            numCols=numColsB, numRows=numRowsB, noPrint=True)
        (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
            h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
        if constantValuesDictB or enumSizeDictB:
            raise Exception("Should be empty? constantValuesDictB %s enumSizeDictB %s" %
                (constantValuesDictB, enumSizeDictB))
        print "missingValuesDictB", missingValuesDictB

        self.assertEqual(missingValuesDictA, missingValuesDictB,
            "missingValuesDict mismatches after re-parse of downloadCsv result")
        self.assertEqual(numColsA, numColsB,
            "numCols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
        # but in this dataset we have a header too, so the row counts should be equal
        # if not, maybe the parse of our dataset didn't detect a row
        self.assertEqual(numRowsA, numRowsB,
            "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB))

        # FIX! should do some comparison of values?
        # maybe can use exec to checksum the columns and compare column list.
        # or compare to expected values? (what are the expected values for the number for time inside h2o?)
        # FIX! should compare the results of the two parses. The infoFromInspect result?
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_many_fp_formats_libsvm_2(self):
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        (100, 40000, 'cC', 300, 'sparse50'),
        (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList:
        # for sel in range(48):  # len(caseList)
        for sel in [random.randint(0, 47)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = key2 + "_" + str(sel)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs, doSummary=False)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseKey['destination_key'],
                timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax + 1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, num_cols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)
                print "\n*************"
                print "colResultList", colResultList
                print "*************"

            self.assertEqual(rowCount, num_rows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            ### print "\nsynColSumDict:", synColSumDict
            for k, v in synColSumDict.iteritems():
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in the same order, so maybe the comparison can be exact (or not!)
                    # (see the tolerance sketch after this test)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have a mean, but we're not enums
                mean = inspect['cols'][k]['mean']
                # our fp formats in the syn generation sometimes only have two places?
                self.assertAlmostEqual(mean, synMean, places=0,
                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                num_missing_values = inspect['cols'][k]['num_missing_values']
                self.assertEqual(0, num_missing_values,
                    msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
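# A small standalone note on the loose float comparison used above (sums_match
# is an illustrative name): unittest's assertAlmostEqual(a, b, places=0)
# passes when round(a - b, 0) == 0, i.e. when the two col sums differ by less
# than roughly 0.5.
def sums_match(generated, parsed, places=0):
    return round(abs(generated - parsed), places) == 0

# e.g. sums_match(1000.4, 1000.1) -> True; sums_match(1000.0, 1001.0) -> False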
def test_c5_KMeans_sphere_26GB_fvec(self):
    # a kludge
    h2o.setup_benchmark_log()

    # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    csvFilename = "syn_sphere15_gen_26GB.csv"
    # csvFilename = 'syn_sphere_gen_h1m.csv'
    # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
    # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
    totalBytes = 183538602156
    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + "/" + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + "/" + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    if NA_COL_BUG:
        expected = [
            # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
            # so shouldn't be used for 26GB
            # or it should be divided by 7
            # the distribution is the same, obviously.
            ([-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                248846122, 1308149283316.2988),
            ([1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                276924291, 1800760152555.98),
            ([5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                235089554, 375419158808.3253),
            ([10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                166180630, 525423632323.6474),
            ([11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                167234179, 1845362026223.1094),
            ([12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                195420925, 197941282992.43475),
            ([19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                214401768, 11868360232.658035),
            ([20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                258853406, 598863991074.3276),
            ([21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                190979054, 1505088759456.314),
            ([25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                87794427, 1124697008162.3955),
            ([39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                78226988, 1151439441529.0215),
            ([40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                167273589, 693036940951.0249),
            ([42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                148426180, 35942838893.32379),
            ([48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                157533313, 88431531357.62982),
            ([147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                118361306, 1111537045743.7646),
        ]
    else:
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                248846122, 1308149283316.2988),
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                276924291, 1800760152555.98),
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                235089554, 375419158808.3253),
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                166180630, 525423632323.6474),
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                167234179, 1845362026223.1094),
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                195420925, 197941282992.43475),
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                214401768, 11868360232.658035),
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                258853406, 598863991074.3276),
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                190979054, 1505088759456.314),
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                87794427, 1124697008162.3955),
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                78226988, 1151439441529.0215),
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                167273589, 693036940951.0249),
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                148426180, 35942838893.32379),
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                157533313, 88431531357.62982),
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                118361306, 1111537045743.7646),
        ]

    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)
        else:
            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs)

        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
            numRows=numRows, numCols=numCols, timeoutSecs=300)
        h2o_cmd.infoFromSummary(summary)

        # KMeans ****************************************
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 500,
            # 'normalize': 1,
            'normalize': 0,  # temp try
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # we get NaNs if whole col is NA
            'ignored_cols': 'C1',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeansResult)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'KMeans', 'trial ' + str(trial),
            csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        # this does predict
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult,
            csvPathname, parseResult, 'd', **kwargs)
        # all are multipliers of the expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        # these clusters were sorted compared to the cluster order in training
        h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
        # why is the expected # of rows not right in KMeans2. That means predictions are wrong
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
            allowError=False, allowRowError=True, trial=trial)

        # the tupleResultList has the size during predict? compare it to the sizes during training
        # I assume they're in the same order.
        model = kmeansResult['model']
        size = model['size']
        size2 = [t[1] for t in tupleResultList]

        if 1 == 1:  # debug
            print "training size:", size
            print "predict size2:", size2
            print "training sorted(size):", sorted(size)
            print "predict sorted(size2):", sorted(size2)
            print h2o.nodes[0].http_addr
            print h2o.nodes[0].port

        clusters = model['centers']
        cluster_variances = model['within_cluster_variances']
        error = model['total_within_SS']
        iterations = model['iterations']
        normalized = model['normalized']
        max_iter = model['max_iter']

        print "iterations", iterations
        if iterations >= (max_iter - 1):
            # h2o hits the limit at max_iter-1..shouldn't hit it
            raise Exception("trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s" %
                (trial, iterations, max_iter))

        # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure
        # can't do this compare, because size2 is sorted by center order..
        # so we don't know how to reorder size the same way
        # we could just sort the two of them, for some bit of comparison
        # (see the sketch after this test)
        if sorted(size) != sorted(size2):
            raise Exception("trial: %s training cluster sizes: %s not the same as predict on same data: %s" %
                (trial, size, size2))

        # our expected result is sorted by cluster center order. but the sizes are from the predicted histogram
        expectedSize = [t[1] / SCALE_SIZE for t in expected]
        if size2 != expectedSize:
            raise Exception("trial: %s training cluster sizes: %s not the same as expected: %s" %
                (trial, size, expectedSize))

        if DELETE_KEYS_EACH_ITER:
            h2i.delete_keys_at_all_nodes()
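# Order-insensitive size check, as used above (a sketch; sizes_consistent and
# its parameters are illustrative names): training sizes and predicted
# histogram sizes can be permuted differently, so only their sorted forms are
# directly comparable, and the expected counts are divided by SCALE_SIZE since
# (per the comment above) they were recorded for the larger 180GB dataset.
def sizes_consistent(trainSizes, predictSizes, expectedTuples, scale):
    # training vs predict must agree up to permutation
    if sorted(trainSizes) != sorted(predictSizes):
        return False
    # expected tuples carry the row count in slot 1, scaled down for 26GB
    expectedSizes = [t[1] / scale for t in expectedTuples]
    return predictSizes == expectedSizes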
def test_parse_bounds_csv(self):
    print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 100000, 'cB', 300),
        (1000, 1000, 'cA', 300),
        (1000, 999, 'cC', 300),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        # dict of col sums for comparison to exec col sums below
        synSumList = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # PARSE**********************
        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put',
            timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # INSPECT*******************
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
            max_column_display=colCount, timeoutSecs=timeoutSecs)
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        row_size = inspect['row_size']
        value_size_bytes = inspect['value_size_bytes']
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(num_rows), \
            "    num_cols:", "{:,}".format(num_cols), \
            "    value_size_bytes:", "{:,}".format(value_size_bytes), \
            "    row_size:", "{:,}".format(row_size)

        expectedRowSize = num_cols * 1  # plus output
        expectedValueSize = expectedRowSize * num_rows
        self.assertEqual(row_size, expectedRowSize,
            msg='row_size %s is not expected num_cols * 1 byte: %s' % (row_size, expectedRowSize))
        self.assertEqual(value_size_bytes, expectedValueSize,
            msg='value_size_bytes %s is not expected row_size * rows: %s' % (value_size_bytes, expectedValueSize))

        iCols = inspect['cols']
        iColNameToOffset = {}
        for iColDict in iCols:
            # even though 'offset' exists, we'll use 'name' as the common key
            # to compare inspect and summary results
            iName = iColDict['name']
            iOffset = iColDict['offset']
            iColNameToOffset[iName] = iOffset
            # just touching to make sure they are there
            num_missing_values = iColDict['num_missing_values']
            iMin = float(iColDict['min'])
            iMax = float(iColDict['max'])
            iMean = float(iColDict['mean'])
            iVariance = float(iColDict['variance'])

        # SUMMARY********************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_column_display=colCount, timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
        self.assertEqual(rowCount, num_rows,
            msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

        summary = summaryResult['summary']
        columnsList = summary['columns']
        self.assertEqual(colCount, len(columnsList),
            msg="generated %s cols (including output). summary has %s columns" % (colCount, len(columnsList)))

        for columns in columnsList:
            name = columns['name']
            iOffset = iColNameToOffset[name]
            iColDict = iCols[iOffset]

            iMin = iColDict['min']
            iMax = iColDict['max']
            iMean = iColDict['mean']
            iVariance = iColDict['variance']
            iNumMissingValues = iColDict['num_missing_values']

            # from the summary
            N = columns['N']
            stype = columns['type']

            histogram = columns['histogram']
            bin_size = histogram['bin_size']
            bin_names = histogram['bin_names']
            bins = histogram['bins']
            nbins = histogram['nbins']

            smax = columns['max']
            smin = columns['min']
            smean = columns['mean']
            sigma = columns['sigma']
            na = columns['na']
            # no zeroes if enum, but we're not enum here
            zeros = columns['zeros']

            self.assertEqual(iMin, smin[0], "inspect min %s != summary min %s" % (iMin, smin))
            self.assertEqual(iMax, smax[0], "inspect max %s != summary max %s" % (iMax, smax))
            self.assertEqual(iMean, smean, "inspect mean %s != summary mean %s" % (iMean, smean))
            self.assertEqual(iVariance, sigma, "inspect variance %s != summary sigma %s" % (iVariance, sigma))
            self.assertEqual(iNumMissingValues, na,
                "inspect num_missing_values %s != summary na %s" % (iNumMissingValues, na))
            # no comparison for 'zeros'

            # now, also compare expected values
            if name == "V1":
                synNa = 0
                # can reverse-engineer the # of zeroes, since data is always 1
                synSum = synSumList[1]  # could get the same sum for all cols
                synZeros = num_rows - synSum
                synSigma = 0.50
                synMean = (synSum + 0.0) / num_rows
                synMin = [0.0, 1.0]
                synMax = [1.0, 0.0]

            elif name == "V2":
                synSum = 0
                synSigma = 0
                synMean = 0
                if DO_NAN:
                    synZeros = 0
                    synNa = num_rows
                    synMin = []
                    synMax = []
                else:
                    synZeros = num_rows
                    synNa = 0
                    synMin = [0.0]
                    synMax = [0.0]

            # a single 1 in the last col
            elif name == "V" + str(colCount - 1):  # h2o puts a "V" prefix
                synNa = 0
                synSum = synSumList[colCount - 1]
                synZeros = num_rows - 1
                # stddev.p
                # http://office.microsoft.com/en-us/excel-help/stdev-p-function-HP010335772.aspx
                # (see the numeric sanity check after this test)
                synMean = 1.0 / num_rows  # why does this need to be a 1 entry list
                synSigma = math.sqrt(pow((synMean - 1), 2) / num_rows)
                print "last col with single 1. synSigma:", synSigma
                synMin = [0.0, 1.0]
                synMax = [1.0, 0.0]

            else:
                synNa = 0
                synSum = 0
                synZeros = num_rows
                synSigma = 0.0
                synMean = 0.0
                synMin = [0.0]
                synMax = [0.0]

            if DO_MEAN:
                self.assertAlmostEqual(float(smean), synMean, places=6,
                    msg='col %s mean %s is not equal to generated mean %s' % (name, smean, synMean))

            # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
            self.assertTrue(smin >= synMin,
                msg='col %s min %s is not >= generated min %s' % (name, smin, synMin))
            self.assertTrue(smax <= synMax,
                msg='col %s max %s is not <= generated max %s' % (name, smax, synMax))

            # reverse engineered the number of zeroes, knowing data was always 1 if present?
            if name == "V65536" or name == "V65537":
                print "columns around possible zeros mismatch:", h2o.dump_json(columns)

            self.assertEqual(na, synNa,
                msg='col %s na %s is not equal to generated na %s' % (name, na, synNa))
            self.assertEqual(zeros, synZeros,
                msg='col %s zeros %s is not equal to generated zeros %s' % (name, zeros, synZeros))
            self.assertEqual(stype, 'number',
                msg='col %s type %s is not equal to %s' % (name, stype, 'number'))

            # our random generation will have some variance for col 1. so just check to 2 places
            if synSigma:
                self.assertAlmostEqual(float(sigma), synSigma, delta=0.03,
                    msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma))
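# Numeric sanity check (standalone; exact_sigma/approx_sigma are illustrative
# names, not part of the test harness) for the stddev.p reasoning in the
# last-col branch above: a column of num_rows values holding a single 1 and
# zeros elsewhere has population variance ((n-1)*mu^2 + (1-mu)^2) / n with
# mu = 1/n; the test's sqrt((mu-1)^2 / n) is a large-n approximation of that.
import math

def exact_sigma(n):
    mu = 1.0 / n
    return math.sqrt(((n - 1) * mu ** 2 + (1 - mu) ** 2) / n)

def approx_sigma(n):
    mu = 1.0 / n
    return math.sqrt((mu - 1) ** 2 / n)

# e.g. exact_sigma(1000) ~ 0.031607 and approx_sigma(1000) ~ 0.031591:
# comfortably inside the delta=0.03 tolerance the sigma assertion allows.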
def test_c10_glm_fvec(self):
    h2o.beta_features = True
    print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here"
    print "Want to be able to run python as jenkins"
    print "I guess for big 0xcust files, we don't need schema='put'"
    print "For files that we want to put (for testing put), we can get non-private files"

    # Parse Train***********************************************************
    importFolderPath = '/mnt/0xcustomer-datasets/c3'
    csvFilename = 'classification1Train.txt'
    csvPathname = importFolderPath + "/" + csvFilename

    start = time.time()
    # hack. force it to NA the header, so we have col names that are not customer-sensitive below
    parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False, header=0)
    print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
    print "Parse result['destination_key']:", parseResult['destination_key']

    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    numRows = inspect['numRows']
    numCols = inspect['numCols']

    # do summary of the parsed dataset last, since we know it fails on this dataset
    summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
    h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

    # keepList = []
    # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
    # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices

    ignore_x = []
    x = [6, 7, 8, 10, 12, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 43, 44, 45, 46, 47,
         49, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]
    for i in range(numCols):
        if i not in x:
            ignore_x.append(i)

    # since we're no longer zero-based, increment by 1
    ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

    # GLM Train***********************************************************
    keepPattern = None
    y = 0
    print "y:", y
    # don't need the intermediate Dicts produced from columnInfoFromInspect
    x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
    print "x:", x
    print "ignore_x:", ignore_x

    kwargs = {
        'response': y,
        'ignored_cols': ignore_x,
        'family': 'binomial',
        'lambda': 1.0E-5,
        'alpha': 0.5,
        'max_iter': 10,
        'n_folds': 1,
        'beta_epsilon': 1.0E-4,
    }

    timeoutSecs = 3600
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "glm completed in", elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

    # Parse Test***********************************************************
    GLMModel = glm['glm_model']
    modelKey = GLMModel['_key']

    csvFilename = 'classification1Test.txt'
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
    print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
def test_xl_ast_assert_ZZ(self):
    #*****************************************
    a = DF('a1')  # inits to -1
    checkAst(astForInit(a))
    # I suppose use of the h2o inspect request is deprecated
    # h2o_cmd.runInspect uses Frames?
    if 1 == 0:
        inspect = h2o.n0.inspect(key=a)  # str(a) becomes 'a1'. so this param should take type Key for key=
        print "a/a1:", dump_json(inspect)

    # let's use runSummary for fun..returns OutputObj for the col
    # will get from column 0, since column not specified
    summaryResult = h2o_cmd.runSummary(key=a)
    co = h2o_cmd.infoFromSummary(summaryResult)
    print "co.label:", co.label
    print "co.data:", co.data

    # how can we get a bunch of data?
    b = DF('b1')  # inits to -1
    checkAst(astForInit(b))
    c = DF('c1')  # inits to -1
    checkAst(astForInit(c))
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

    h2p.yellow_print("Assign compare1")
    Assign(c[0], c[0] + 0)
    checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare2")
    Assign(c[0], c[0] - 0)
    checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare3")
    Assign(c[0], c[0] == 0)
    checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare4")
    Assign(c[0], c[0] != 0)
    checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

    # h2o_xl.debugPrintEnable = True

    #*****************************************
    c = DF('c1')

    h2p.yellow_print("<<= compare1")
    c[0] <<= (c[0] + 0)
    checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

    h2p.yellow_print("<<= compare2")
    c[0] <<= (c[0] - 0)
    checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

    h2p.yellow_print("<<= compare3")
    c[0] <<= (c[0] == 0)
    checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

    #*****************************************
    c = DF('c1')  # inits to -1

    h2p.yellow_print("compare1")
    # doesn't assign result to a key?, gets result if scalar, otherwise gets a list or ???
    # .result can give us scalar, list, Key, None
    # .result could be a property that triggers a csv download, if we didn't cache the
    # scalar/list result because it was small?
    # i.e. check if .result_cached was None, when the .result property is used
    # (property avoids the need for ())
    result = Expr(c[0] == -1).result
    checkAst("(n ([ %c1 #0 #0) #-1)")
    h2p.yellow_print("Expr result..Desire: python datatype/value if scalar or list, else Key: %s %s" % (type(result), result))
    assert result == 1.0, "%s %s" % (type(result), result)  # real result?

    if result:
        print "true for if of result", type(result), result
    else:
        print "else for if of result", type(result), result

    #*****************************************
    # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
    result = Assign(None, c[0] == -1).result
    checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
    h2p.yellow_print("Assign result..Desire: python datatype/value if scalar or list, else Key: %s %s" % (type(result), result))
    assert result == 1.0, "%s %s" % (type(result), result)  # real result?

    if result:
        print "true if of result", result
    else:
        print "false if of result", result
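# Sketch of the cached-property idea described in the .result comments above
# (ExprSketch and compute_fn are hypothetical names; this is not the h2o_xl
# implementation): compute on first access of .result and cache it, so small
# scalar/list results come back directly without needing a () call.
class ExprSketch(object):
    def __init__(self, compute_fn):
        self._compute_fn = compute_fn
        self._result_cached = None

    @property
    def result(self):
        # compute once, then serve the cached value on later accesses
        if self._result_cached is None:
            self._result_cached = self._compute_fn()
        return self._result_cached

# usage sketch: ExprSketch(lambda: 1.0).result -> 1.0 (computed once, cached)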
def test_parse_time_rand_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_time.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    colCount = 6
    rowCount = 10
    headerData = rand_header(colCount)
    write_syn_dataset(csvPathname, rowCount, colCount, headerData)

    for trial in range(1):
        rowData = rand_rowData()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"

        start = time.time()
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        numRowsA = inspect['numRows']
        numColsA = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
            numCols=numColsA, numRows=numRowsA, noPrint=True)
        print summaryResult
        h2o_cmd.infoFromSummary(summaryResult)

        (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
            h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
        if constantValuesDictA or enumSizeDictA:
            raise Exception("Should be empty? constantValuesDictA %s enumSizeDictA %s" %
                (constantValuesDictA, enumSizeDictA))

        print "missingValuesDictA", missingValuesDictA
        # self.assertEqual(missingValuesDictA, {}, "missingValuesDict should be empty")
        self.assertEqual(numColsA, colCount)
        self.assertEqual(numRowsA, rowCount)

        # do a little testing of saving the key as a csv
        csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
        h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

        # remove the original parsed key. source was already removed by h2o
        h2o.nodes[0].remove_key(hex_key)

        # interesting. what happens when we do csv download with time data?
        start = time.time()
        parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
        print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        numRowsB = inspect['numRows']
        numColsB = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
            numCols=numColsB, numRows=numRowsB, noPrint=True)
        (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
            h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
        if constantValuesDictB or enumSizeDictB:
            raise Exception("Should be empty? constantValuesDictB %s enumSizeDictB %s" %
                (constantValuesDictB, enumSizeDictB))
        print "missingValuesDictB", missingValuesDictB

        self.assertEqual(missingValuesDictA, missingValuesDictB,
            "missingValuesDict mismatches after re-parse of downloadCsv result")
        self.assertEqual(numColsA, numColsB,
            "numCols mismatches after re-parse of downloadCsv result")
        # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
        # but in this dataset we have a header too, so the row counts should be equal
        # if not, maybe the parse of our dataset didn't detect a row
        self.assertEqual(numRowsA, numRowsB,
            "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB))

        # FIX! should do some comparison of values?
        # maybe can use exec to checksum the columns and compare column list.
        # or compare to expected values? (what are the expected values for the number for time inside h2o?)
        # FIX! should compare the results of the two parses. The infoFromInspect result?
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_four_billion_rows(self):
    timeoutSecs = 1500
    importFolderPath = "billions"
    csvFilenameList = [
        ("four_billion_rows.csv", "a.hex"),
        ("four_billion_rows.csv", "b.hex"),
    ]
    for (csvFilename, hex_key) in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()

        # Parse*********************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
        elapsed = time.time() - start
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        value_size_bytes = inspect['value_size_bytes']
        row_size = inspect['row_size']
        print "\n" + csvFilename, \
            "    num_rows:", "{:,}".format(num_rows), \
            "    num_cols:", "{:,}".format(num_cols), \
            "    value_size_bytes:", "{:,}".format(value_size_bytes), \
            "    row_size:", "{:,}".format(row_size)

        expectedRowSize = num_cols * 1  # plus output
        expectedValueSize = expectedRowSize * num_rows
        self.assertEqual(row_size, expectedRowSize,
            msg='row_size %s is not expected num_cols * 1 byte: %s' % (row_size, expectedRowSize))
        self.assertEqual(value_size_bytes, expectedValueSize,
            msg='value_size_bytes %s is not expected row_size * rows: %s' % (value_size_bytes, expectedValueSize))

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(2, num_cols,
            msg="generated %s cols (including output). parsed to %s cols" % (2, num_cols))
        self.assertEqual(4 * 1000000000, num_rows,
            msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, num_rows))

        # KMeans*********************************
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'max_iter': 20,
            'cols': None,
            'normalize': 0,
            'destination_key': 'junk.hex',
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'x': 0,
            'y': 1,
            'n_folds': 0,
            'case_mode': '=',
            'case': 1,
        }
        # one coefficient is checked a little more
        colX = 0

        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans['model']
        model_key = model['_key']
        centers = model['centers']
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint('kmeans result:', h2o.dump_json(kmeansResult))
        model = kmeansResult['KMeansModel']
        centers = model['clusters']
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass  # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(data_key=parseResult['destination_key'],
            model_key=model_key, destination_key=predictKey)
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult['summaries'][0]['hcnt']  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances
    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(data_key=parseResult['destination_key'],
            model_key=model_key, destination_key=applyDestinationKey)
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult['destination_key'], model_key=model_key)
        score = kmeansScoreResult['score']
        rows_per_cluster = score['rows_per_cluster']
        sqr_error_per_cluster = score['sqr_error_per_cluster']

    # (a hedged sketch of how these tuples get compared follows this function)
    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
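# Hedged sketch of the multiplier-style tolerance that consumers like
# h2o_kmeans.compareResultsToExpected apply to the (center, rows, sqr_error)
# tuples returned above. The exact semantics are an assumption for
# illustration; allowedDelta entries act as fractional tolerances per field.
def within_delta(actual, expected, frac):
    return abs(actual - expected) <= abs(expected) * frac

def check_tuple(actualTuple, expectedTuple, allowedDelta):
    (aCenter, aRows, aErr) = actualTuple
    (eCenter, eRows, eErr) = expectedTuple
    # every center coordinate, the row count, and the sq error must be close
    okCenter = all(within_delta(a, e, allowedDelta[0]) for a, e in zip(aCenter, eCenter))
    return (okCenter and within_delta(aRows, eRows, allowedDelta[1])
        and within_delta(aErr, eErr, allowedDelta[2]))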
def test_four_billion_rows(self):
    h2o.beta_features = False
    timeoutSecs = 1500
    importFolderPath = "billions"
    csvFilenameList = [
        "four_billion_rows.csv",
    ]
    for csvFilename in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()

        # Parse*********************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=timeoutSecs, pollTimeoutSecs=180)
        elapsed = time.time() - start
        print "Parse result['destination_key']:", parseResult['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        # forget about checking the bytesize
        print "\n" + csvFilename, \
            "    num_rows:", "{:,}".format(num_rows), \
            "    num_cols:", "{:,}".format(num_cols)

        expectedRowSize = num_cols * 1  # plus output
        # expectedValueSize = expectedRowSize * num_rows

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(2, num_cols,
            msg="generated %s cols (including output). parsed to %s cols" % (2, num_cols))
        self.assertEqual(4 * 1000000000, num_rows,
            msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, num_rows))

        # KMeans*********************************
        kwargs = {
            'k': 3,
            'cols': 'C1, C2',
            'initialization': 'Furthest',
            'max_iter': 4,
            'normalize': 0,
            'destination_key': 'junk.hex',
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'y': 'C2',
            'n_folds': 0,
            'family': 'binomial',
            'case_mode': '=',
            'case': 1,
        }
        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)