def test_parse_1m_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [(10, 65000, "cH", 30)] h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() print "Summary should work with 65k" parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True ) print csvFilename, "parse time:", parseResult["response"]["time"] print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs) print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, " num_rows:", "{:,}".format( inspect["num_rows"] ), " num_cols:", "{:,}".format(inspect["num_cols"]) # should match # of cols in header or ?? 
self.assertEqual( inspect["num_cols"], colCount, "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount), ) self.assertEqual( inspect["num_rows"], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % (inspect["num_rows"], rowCount), ) # we should obey max_column_display column_limits = [25, 25000, 50000] for column_limit in column_limits: inspect = h2o_cmd.runInspect( None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs ) self.assertEqual( len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit) ) for r in range(0, len(inspect["rows"])): # NB: +1 below because each row includes a row header row: #{row} self.assertEqual( len(inspect["rows"][r]), column_limit + 1, "inspect data rows obeys max_column_display = " + str(column_limit), )
def test_from_import_fvec(self): csvFilenameAll = [ ("covtype.data", 500), # ("covtype20x.data", 1000), ] for (csvFilename, timeoutSecs) in csvFilenameAll: # creates csvFilename.hex from file in importFolder dir hex_key = csvFilename + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True) h2o_cmd.infoFromInspect(inspect, parseResult['destination_key']) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) # h2o_cmd.infoFromSummary(summaryResult) trees = 2 start = time.time() rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n"+l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def parseFile(self, importFolderPath='datasets', csvFilename='airlines_all.csv', timeoutSecs=500, **kwargs): csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs) elapsed = time.time() - start print "Parse of", parseResult[ 'destination_key'], "took", elapsed, "seconds" parseResult['python_call_timer'] = elapsed print "Parse result['destination_key']:", parseResult[ 'destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=200) elapsed = time.time() - start print "Inspect:", parseResult[ 'destination_key'], "took", elapsed, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] print "numRows:", numRows, "numCols", numCols return parseResult
def test_many_cols_and_types(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5, 'cA', 5), (1000, 59, 'cB', 5), (5000, 128, 'cC', 5), (6000, 507, 'cD', 5), (9000, 663, 'cE', 5), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvFilename
def test_rf_allyears2k_oobe(self): importFolderPath = '/home/0xdiag/datasets' csvFilename = 'allyears2k.csv' csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) for trial in range(10): kwargs = paramDict timeoutSecs = 30 + kwargs['ntree'] * 2 start = time.time() # randomize the node node = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix']['classification_error'] rows_skipped = rfView['confusion_matrix']['rows_skipped'] mtry = rfView['mtry'] mtry_nodes = rfView['mtry_nodes'] print "mtry:", mtry print "mtry_nodes:", mtry_nodes self.assertEqual(classification_error, 0, "Should have zero oobe error") self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped") print "Trial #", trial, "completed"
def test_KMeans_covtype_cols_fvec(self): h2o.beta_features = True # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("covtype.binary.svm", "cC", 30, 1), # normal csv ] ### csvFilenameList = random.sample(csvFilenameAll,1) # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False importFolderPath = "libsvm" for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] # KMEANS****************************************** for trial in range(1): kwargs = { 'k': 3, 'initialization': 'Furthest', 'ignored_cols': range(11, numCols), 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], for trial2 in range(3): timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def bigCheckResults(self, kmeans, csvPathname, parseKey, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) model_key = kmeans['destination_key'] kmeansResult = h2o_cmd.runInspect(key=model_key) centers = kmeansResult['KMeansModel']['clusters'] kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseKey['destination_key'], model_key=model_key, destination_key=applyDestinationKey) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) kmeansScoreResult = h2o.nodes[0].kmeans_score( key=parseKey['destination_key'], model_key=model_key) score = kmeansScoreResult['score'] rows_per_cluster = score['rows_per_cluster'] sqr_error_per_cluster = score['sqr_error_per_cluster'] tupleResultList = [] for i,c in enumerate(centers): print "\ncenters["+str(i)+"]: ", centers[i] print "rows_per_cluster["+str(i)+"]: ", rows_per_cluster[i] print "sqr_error_per_cluster["+str(i)+"]: ", sqr_error_per_cluster[i] tupleResultList.append( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) ) return (centers, tupleResultList)
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs) print h2o.dump_json(importResult) storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30) # print h2o.dump_json(storeViewResult) hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print 
"\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def import_frame(self, target_key, bucket, csvFilename, csvPathname, expected_rows, expected_cols): path = csvPathname + '/' + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=path, hex_key=target_key, schema='put') # upload the file destination_key = parseResult[ 'destination_key'] # we block until it's actually ready inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) actual_rows = inspect['numRows'] actual_cols = inspect['numCols'] print 'loaded frame "' + target_key + '" from path: ' + path print 'rows: ', actual_rows print 'cols: ', actual_cols # Don't have access to the testCase assert methods here because they aren't class methods. :-( assert expected_rows == actual_rows, "Expected " + str( expected_rows) + " but got " + str( actual_rows) + " for path: " + path assert expected_cols == actual_cols, "Expected " + str( expected_cols) + " but got " + str( actual_cols) + " for path: " + path # TODO: other info we could check # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ # h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) # # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) # h2o_cmd.infoFromSummary(summaryResult) # , noPrint=True return destination_key
def test_parse_200k_cols_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 100000, 'cA', 200, 200), # (10, 200000, 'cB', 200, 200), # (10, 300000, 'cB', 200, 200), # we timeout/fail on 500k? stop at 200k # (10, 500000, 'cC', 200, 200), # (10, 1000000, 'cD', 200, 360), # (10, 1100000, 'cE', 60, 100), # (10, 1200000, 'cF', 60, 120), ] h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse:", parseResult['destination_key'], "took", time.time( ) - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual( inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount)) # if not h2o.browse_disable: # h2b.browseJsonHistoryAsUrlLastMatch("Inspect") # time.sleep(5) h2i.delete_keys_at_all_nodes()
def bigCheckResults(self, kmeans, csvPathname, parseKey, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) model_key = kmeans['destination_key'] kmeansResult = h2o_cmd.runInspect(key=model_key) centers = kmeansResult['KMeansModel']['clusters'] kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseKey['destination_key'], model_key=model_key, destination_key=applyDestinationKey) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) kmeansScoreResult = h2o.nodes[0].kmeans_score( key=parseKey['destination_key'], model_key=model_key) score = kmeansScoreResult['score'] rows_per_cluster = score['rows_per_cluster'] sqr_error_per_cluster = score['sqr_error_per_cluster'] tupleResultList = [] for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", centers[i] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str( i) + "]: ", sqr_error_per_cluster[i] tupleResultList.append( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def createTestTrain(srcKey, trainDstKey, testDstKey, percent, outputClass, numCols): # will have to live with random extract. will create variance print "train: get random %. change class 4 to 1, everything else to 0. factor() to turn real to int (for rf)" # Create complexity for no good reason!. Do the same thing 5 times in the single exec expressions execExpr = "" STUPID_REPEAT = 20 for i in range(STUPID_REPEAT): execExpr += "a.hex=runif(%s);" % srcKey execExpr += "%s=%s[a.hex%s,];" % (trainDstKey, srcKey, '<=0.9') if not DO_MULTINOMIAL: execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, numCols, trainDstKey, numCols, outputClass) execExpr += "factor(%s[, %s]);" % (trainDstKey, numCols) h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=STUPID_REPEAT * 15) inspect = h2o_cmd.runInspect(key=trainDstKey) h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) ) print "test: same, but use the same runif() random result, complement" execExpr = "a.hex=runif(%s);" % srcKey execExpr += "%s=%s[a.hex%s,];" % (testDstKey, srcKey, '>0.9') if not DO_MULTINOMIAL: execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, numCols, testDstKey, numCols, outputClass) execExpr += "factor(%s[, %s])" % (testDstKey, numCols) h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=testDstKey) h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
def test_one_hot_expand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 1100, 'cA', 5), (100, 1000, 'cB', 5), (100, 900, 'cC', 5), (100, 800, 'cD', 5), (100, 700, 'cE', 5), (100, 600, 'cF', 5), (100, 500, 'cG', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, hex_key, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # does it modify the original or ? oneHotResult = h2o.nodes[0].one_hot(source=parseResult['destination_key']) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname)
def test_c7_rel(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 if DO_GLM: start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
def test_parse_summary_c21(self):
    """Parse, inspect and summarize the c21 file under a train key and a test key.

    Each frame is inspected and summarized under the key it was parsed
    into. Previously the second pass parsed into 'test.hex' but then
    inspected and summarized 'train.hex' again, so the test frame was
    never examined.
    """
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    timeoutSecs = 300

    # (path, destination key) pairs; both keys currently use the same source file.
    datasets = [
        (importFolderPath + '/persona_clean_deep.tsv.zip', 'train.hex'),
        (importFolderPath + '/persona_clean_deep.tsv.zip', 'test.hex'),
    ]

    for (csvPathname, key) in datasets:
        parseResult = h2i.import_parse(path=csvPathname, hex_key=key, timeoutSecs=timeoutSecs)

        # Inspect and summarize the key that was just parsed.
        inspect = h2o_cmd.runInspect(key=key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname, missingValuesList)

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=hex_key, destination_key=predictHexKey) print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, 
skipPredictHeader)) wrong = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o)!=str(p): if wrong==10: print "Not printing any more mismatches\n" elif wrong<10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong return pctWrong
def test_many_cols_and_types(self): SEED = random.randint(0, sys.maxint) print "\nUsing random seed:", SEED # SEED = random.seed(SEED) SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5, "cA", 5), (1000, 59, "cB", 5), (5000, 128, "cC", 5), (6000, 507, "cD", 5), (9000, 663, "cE", 5), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvFilename
def test_big_sum_fail(self): node = h2o.nodes[0] SYNDATASETS_DIR = h2o.make_syn_dir() csvPathname = SYNDATASETS_DIR + '/temp.csv' hex_key = 'temp.hex' for trial in range(5): # what about seed? cfResult = h2o.nodes[0].create_frame(key=hex_key, binary_ones_fraction=0.02, binary_fraction=0, randomize=1, missing_fraction=0, integer_fraction=1, real_range=100, has_response=0, response_factors=2, factors=100, cols=1, integer_range=100, value=0, categorical_fraction=0, rows=2.5e+08, timeoutSecs=300) inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect, hex_key) if UNNECESSARY: # this is just doing a head to R. not critical h2e.exec_expr(execExpr="%s = %s" % (hex_key, hex_key)) h2e.exec_expr(execExpr="Last.value.0 = %s[c(1,2,3,4,5,6),]" % hex_key) h2e.exec_expr(execExpr="Last.value.0 = Last.value.0") node.csv_download(src_key="Last.value.0", csvPathname=csvPathname) node.remove_key("Last.value.0") # not sure why this happened h2o_cmd.runStoreView(view=10000, offset=0) # Fails on this h2e.exec_expr(execExpr='Last.value.1 = %s[,1]' % hex_key) print "Trial #", trial, "completed"
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30) start = time.time() predict = h2o_nodes.nodes[0].generate_predictions(model_key=model_key, data_key=hex_key, destination_key=predictHexKey) print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds' check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o_nodes.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o_nodes.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, 
rowNum2, skipPredictHeader)) wrong = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o)!=str(p): if wrong==10: print "Not printing any more mismatches\n" elif wrong<10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong return pctWrong
def import_frame(self, target_key, bucket, csvFilename, csvPathname, expected_rows, expected_cols): path = csvPathname + '/' + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=path, hex_key=target_key, schema='put') # upload the file destination_key = parseResult['destination_key'] # we block until it's actually ready inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) actual_rows = inspect['numRows'] actual_cols = inspect['numCols'] print 'loaded frame "' + target_key +'" from path: ' + path print 'rows: ', actual_rows print 'cols: ', actual_cols # Don't have access to the testCase assert methods here because they aren't class methods. :-( assert expected_rows == actual_rows, "Expected " + str(expected_rows) + " but got " + str(actual_rows) + " for path: " + path assert expected_cols == actual_cols, "Expected " + str(expected_cols) + " but got " + str(actual_cols) + " for path: " + path # TODO: other info we could check # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ # h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) # # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) # h2o_cmd.infoFromSummary(summaryResult) # , noPrint=True return destination_key
def do_summary_and_inspect(): # SUMMARY****************************************** summaryResult = h2o_cmd.runSummary(key=hex_key) coltypeList = h2o_cmd.infoFromSummary(summaryResult) # INSPECT****************************************** inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] # Now check both inspect and summary if csvFilename=='covtype.binary.svm': for k in range(55): naCnt = inspect['cols'][k]['naCnt'] self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0)) stype = inspect['cols'][k]['type'] print k, stype self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int')) # summary may report type differently than inspect..check it too! # we could check na here too for i,c in enumerate(coltypeList): print "column index: %s column type: %s" % (i, c) # inspect says 'int?" assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)
def test_randomFilter(self): SYNDATASETS_DIR = h2o.make_syn_dir() # use SEED so the file isn't cached? csvFilenameAll = [ ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 1mx8 csv" write_syn_dataset(csvPathname, 1000000, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_rf_big_rand_tree_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowCount = 5000 colCount = 1000 write_syn_dataset(csvPathname, rowCount, colCount) for trial in range (1): # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = csvFilename + "_" + str(trial) hex_key = csvFilename + "_" + str(trial) + ".hex" seed = random.randint(0,sys.maxint) # some cols can be dropped due to constant 0 or 1. make sure data set has all 0's and all 1's above # to guarantee no dropped cols! # kwargs = {'ntree': 3, 'depth': 50, 'seed': seed} # out of memory/GC errors with the above. reduce depth kwargs = {'ntrees': 3, 'max_depth': 20, 'seed': seed} start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=90) h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=600, pollTimeoutSecs=180, **kwargs) print "trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect, csvPathname) cols = inspect['cols'] numCols = inspect['numCols'] for i,c in enumerate(cols): colType = c['type'] self.assertEqual(colType, 'Int', msg="col %d should be type in: %s" % (i, colType)) h2o.check_sandbox_for_errors()
def test_many_cols_and_types(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5, 'cA', 5), (1000, 59, 'cB', 5), (5000, 128, 'cC', 5), (6000, 507, 'cD', 5), (9000, 663, 'cE', 5), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvFilename
def test_parse_summary_c21(self):
    """Parse the c21 dataset twice (train key and test key); inspect and summarize each.

    The second half originally inspected and summarized ``hex_key`` (the
    train frame) again after parsing into ``validation_key``, so the second
    parse result was never examined.  It now uses ``validation_key``.
    """
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    timeoutSecs = 300

    # ---- train frame ----
    csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
    hex_key = 'train.hex'
    h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(key=hex_key)
    h2o_cmd.infoFromInspect(inspect, csvPathname_train)
    # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList)
    numCols = inspect['numCols']
    numRows = inspect['numRows']

    rSummary = h2o_cmd.runSummary(key=hex_key)
    h2o_cmd.infoFromSummary(rSummary, rows=numRows, cols=numCols)

    # ---- test frame (same source file, different destination key) ----
    csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
    validation_key = 'test.hex'
    h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

    # Inspect the frame that was just parsed, not the train frame.
    inspect = h2o_cmd.runInspect(key=validation_key)
    h2o_cmd.infoFromInspect(inspect, csvPathname_test)
    # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList)
    numCols = inspect['numCols']
    numRows = inspect['numRows']

    # NOTE(review): rows/cols are passed to runSummary here but to
    # infoFromSummary in the train half; left as in the original because the
    # helper signatures are not visible from this file -- confirm which is intended.
    rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
    h2o_cmd.infoFromSummary(rSummary)
def test_plot_remove_keys_manyfiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() print "Remember, the parse only deletes what got parsed. We import the folder. So we double import. That should work now" tryList = [ ("file_1[0-9].dat.gz", 'c10', 600), ("file_[1-2][0-9].dat.gz", 'c20', 600), ("file_[1-4][0-9].dat.gz", 'c40', 600), ("file_[1-8][0-9].dat.gz", 'c80', 600), # don't do this case. timesout at 300 sec on polling with 172-180 # ("file_[1-2][1-8][0-9].dat.gz", 'c160', 1200), ] xList = [] eList = [] fList = [] importFolderPath = "manyfiles-nflx-gz" for (csvFilePattern, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, retryDelaySecs=3, timeoutSecs=timeoutSecs, doSummary=False) parseElapsed = time.time() - start print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds" h2o.check_sandbox_for_errors() # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) parsedBytes = inspect['byteSize'] node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" 
start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(parsedBytes) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1==1: xLabel = 'parsedBytes' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_libsvm(self): SYNDATASETS_DIR = h2o.make_syn_dir() for trial in range(2): csvFilename = "syn_ints.csv" hex_key = "1.hex" csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, trial) timeoutSecs = 10 # have to import each time, because h2o deletes source after parse # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000) parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000) # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360) print "Inspect:", hex_key, "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] summaryResult = h2o_cmd.runSummary(key=hex_key) h2o_cmd.infoFromSummary(summaryResult) if DO_KMEANS: # KMEANS****************************************** kwargs = { 'k': 3, 'initialization': 'Furthest', 'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_parse_200k_cols_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 1000, 'cA', 200, 200), (10, 2000, 'cA', 200, 200), (10, 4000, 'cA', 200, 200), (10, 8000, 'cA', 200, 200), (10, 9000, 'cA', 200, 200), (10, 10000, 'cA', 200, 200), # (10, 100000, 'cA', 200, 200), # (10, 200000, 'cB', 200, 200), # (10, 300000, 'cB', 200, 200), # we timeout/fail on 500k? stop at 200k # (10, 500000, 'cC', 200, 200), # (10, 1000000, 'cD', 200, 360), # (10, 1100000, 'cE', 60, 100), # (10, 1200000, 'cF', 60, 120), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() # does it blow up if it sets columnNames? parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False, columnNames=None, intermediateResults=DO_INTERMEDIATE_RESULTS) print "Parse:", csvFilename, "took", time.time() - start, "seconds" print "Skipping the row/cols check for now" if 1==0: start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount)) print "Skipping the delete keys for now" if 1==0: # if not h2o.browse_disable: # h2b.browseJsonHistoryAsUrlLastMatch("Inspect") # time.sleep(5) h2i.delete_keys_at_all_nodes()
def test_frame_split_balance(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20) print "Just split away and see if anything blows up" splitMe = hex_key inspect = h2o_cmd.runInspect(key=splitMe) origNumRows = inspect['numRows'] origNumCols = inspect['numCols'] for s in range(20): inspect = h2o_cmd.runInspect(key=splitMe) numRows = inspect['numRows'] numCols = inspect['numCols'] fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5) split0_key = fs['split_keys'][0] split1_key = fs['split_keys'][1] split0_rows = fs['split_rows'][0] split1_rows = fs['split_rows'][1] split0_ratio = fs['split_ratios'][0] split1_ratio = fs['split_ratios'][1] print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split1_key # split should be within 1 row accuracy. let's say within 20 for now self.assertLess(abs(split1_rows - split0_rows), 2) self.assertEqual(numRows, (split1_rows + split0_rows)) self.assertEqual(numCols, origNumCols) if split0_rows <= 2: break print "Now do some rebalancing on the split frames" for trial in range(2): rb_key = "rb_%s_%s" % (trial, splitMe) SEEDPERFILE = random.randint(0, sys.maxint) randChunks = random.randint(1, 100) start = time.time() print "Trial %s: Rebalancing %s to %s with %s chunks" % ( trial, splitMe, rb_key, randChunks) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks) elapsed = time.time() - start print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\ h2o_cmd.runSummary(key=rb_key) print "\nInspecting the original parsed result" inspect = h2o_cmd.runInspect(key=hex_key) h2o_cmd.infoFromInspect(inspect=inspect) print "\nInspecting the rebalanced result with %s forced chunks" % randChunks inspect = h2o_cmd.runInspect(key=rb_key) 
h2o_cmd.infoFromInspect(inspect=inspect)
def test_c7_rel(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + csvFilename, " num_rows:", "{:,}".format(num_rows), " num_cols:", "{:,}".format(num_cols) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=num_cols, numRows=num_rows, max_column_display=2500) # it's in runSummary! 
# h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numCols=num_cols, numRows=num_rows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 if DO_GLM: start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_many_cols_and_values_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cA', 60, 120), (100, 30000, 'cB', 60, 120), (100, 50000, 'cC', 60, 120), (100, 70000, 'cD', 60, 120), (100, 90000, 'cE', 60, 120), (100, 100000, 'cF', 60, 120), ] if not H2O_SUPPORTS_OVER_100K_COLS: print "Restricting number of columns tested to 100,000" else: tryList = tryList + [ (100, 200000, 'cG', 60, 120), (100, 300000, 'cH', 60, 120), (100, 400000, 'cI', 60, 120), (100, 500000, 'cJ', 60, 120), (100, 600000, 'cK', 60, 120), (100, 700000, 'cL', 60, 120), (100, 800000, 'cM', 60, 120), (100, 900000, 'cN', 60, 120), (100, 1000000, 'cO', 60, 120), ] for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) sel = 0 csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) start = time.time() print csvFilename, "parse starting" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True) h2o.check_sandbox_for_errors() print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def predict_and_compare_csvs(model_key): start = time.time() predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname, msg="Original, after being exec'ed", skipHeader=True) (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True) # no header on source if (rowNum1 != rowNum2): raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \ %s" % (rowNum1, rowNum2)) wrong = 0 wrong0 = 0 wrong1 = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): o = float(o) p = float(p) if o!=p: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) if p==0.0 and wrong0==10: print "Not printing any more predicted=0 mismatches" elif p==0.0 and wrong0<10: print msg if p==1.0 and wrong1==10: print "Not printing any more predicted=1 mismatches" elif p==1.0 and wrong1<10: print msg if p==0.0: wrong0 += 1 elif p==1.0: wrong1 += 1 wrong += 1 print "wrong0:", wrong0 print "wrong1:", wrong1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 10.0: raise Exception("pct wrong too high. Expect < 10% error")
def predict_and_compare_csvs(model_key): start = time.time() predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname, msg="Original, after being exec'ed", skipHeader=True) (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True) # no header on source if (rowNum1 != rowNum2): raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \ %s" % (rowNum1, rowNum2)) wrong = 0 wrong0 = 0 wrong1 = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): o = float(o) p = float(p) if o!=p: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) if p==0.0 and wrong0==10: print "Not printing any more predicted=0 mismatches" elif p==0.0 and wrong0<10: print msg if p==1.0 and wrong1==10: print "Not printing any more predicted=1 mismatches" elif p==1.0 and wrong1<10: print msg if p==0.0: wrong0 += 1 elif p==1.0: wrong1 += 1 wrong += 1 print "wrong0:", wrong0 print "wrong1:", wrong1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 16.0: raise Exception("pct wrong: %s too high. 
Expect < 16 pct error" % pctWrong)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) if h2o.beta_features: model_key = kmeans["model"]["_selfKey"] # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame # can't use inspect on a model key? now? kmeansResult = kmeans model = kmeansResult["model"] centers = model["clusters"] error = model["error"] else: model_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) model = kmeansResult["KMeansModel"] centers = model["clusters"] error = model["error"] if h2o.beta_features: # need to use Predict2? pass # no scoring on Kmeans2?..just reuse # cols/max_ncols params? predictKey = applyDestinationKey predictResult = h2o.nodes[0].generate_predictions( data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey ) summaryResult = h2o.nodes[0].summary_page(key=predictKey) hcnt = summaryResult["summaries"][0]["hcnt"] # histogram rows_per_cluster = hcnt # have to figure out how to get this with fvec sqr_error_per_cluster = [0 for h in hcnt] else: kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey ) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) # this was failing summaryResult = h2o_cmd.runSummary(key=applyDestinationKey) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key) score = kmeansScoreResult["score"] rows_per_cluster = score["rows_per_cluster"] sqr_error_per_cluster = score["sqr_error_per_cluster"] tupleResultList = [] print "\nerror: ", error for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", centers[i] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str(i) + "]: ", 
sqr_error_per_cluster[i] tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def test_parse_cust(self): # run as user 0xcustomer to get access (with .json config and ssh key file specified) importFolderPath = '/mnt/0xcustomer-datasets' pollTimeoutSecs = 120 retryDelaySecs = 30 timeoutSecs = 300 (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*") importFileList = importResult['files'] importFailList = importResult['fails'] importKeyList = importResult['keys'] importDelList = importResult['dels'] if len(importDelList)!=0: raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList)) if len(importFileList)<MINFILES: raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList)) if len(importKeyList)<MINFILES: raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList)) if len(importFailList)!=0: raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList)) # only parse files with .csv or .tsv in their name (no dirs like that?) goodKeyList = [key for key in importKeyList if ('.csv' in key or '.tsv' in key)] trial = 0 # just do 1? for i, importKey in enumerate(random.sample(goodKeyList,3)): print "importKey:", importKey trial +=1 start = time.time() # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like # force header=0..should mean headers get treated as NAs parseResult = h2i.parse_only(pattern=importKey, header=0, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] origKey = parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=origKey) h2o_cmd.infoFromInspect(inspect, origKey) execExpr = 'newKey = '+origKey+'[1,1]' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) newParseKey = {'destination_key': 'newKey'} h2o_cmd.checkKeyDistribution() h2o.nodes[0].remove_key(key=origKey) # a key isn't created for a scalar # h2o.nodes[0].remove_key(key='newKey') self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
def test_parse_bounds_libsvm(self): # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("mnist_train.svm", "cM", 30, 1), # FIX! fails KMeansScore # not integer output # ("colon-cancer.svm", "cA", 30, 1), ("connect4.svm", "cB", 30, 1), ("syn_6_1000_10.svm", "cK", 30, 1), ("syn_0_100_1000.svm", "cL", 30, 1), ("mushrooms.svm", "cG", 30, 1), ("duke.svm", "cD", 30, 1), # too many features? 150K inspect timeout? # ("E2006.train.svm", "cE", 30, 1), ("gisette_scale.svm", "cF", 30, 1), ("news20.svm", "cH", 30, 1), ("tmc2007_train.svm", "cJ", 30, 1), ("covtype.binary.svm", "cC", 30, 1), # normal csv ] ### csvFilenameList = random.sample(csvFilenameAll,1) # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse bucket = "home-0xdiag-datasets" csvPathname = "libsvm/" + csvFilename # PARSE****************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print csvPathname, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) # RF****************************************** kwargs = { 'ntree': 6, 'response_variable': 0, } timeoutSecs = 600 start = time.time() rf = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100)
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs): simpleCheckKMeans(self, kmeans, **kwargs) if h2o.beta_features: # can't use inspect on a model key? now? model = kmeans["model"] model_key = model["_key"] centers = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] kmeansResult = kmeans else: model_key = kmeans["destination_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult)) model = kmeansResult["KMeansModel"] centers = model["clusters"] error = model["error"] if h2o.beta_features: # need to use Predict2? pass # no scoring on Kmeans2?..just reuse # cols/max_ncols params? predictKey = applyDestinationKey predictResult = h2o.nodes[0].generate_predictions( data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey ) summaryResult = h2o.nodes[0].summary_page(key=predictKey) hcnt = summaryResult["summaries"][0]["hcnt"] # histogram rows_per_cluster = hcnt # FIX! 
does the cluster order/naming match, compared to cluster variances sqr_error_per_cluster = cluster_variances else: kmeansApplyResult = h2o.nodes[0].kmeans_apply( data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey ) inspect = h2o_cmd.runInspect(None, applyDestinationKey) h2o_cmd.infoFromInspect(inspect, csvPathname) # this was failing summaryResult = h2o_cmd.runSummary(key=applyDestinationKey) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key) score = kmeansScoreResult["score"] rows_per_cluster = score["rows_per_cluster"] sqr_error_per_cluster = score["sqr_error_per_cluster"] tupleResultList = [] print "\nerror: ", error for i, c in enumerate(centers): print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]] print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i] print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i] tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i])) return (centers, tupleResultList)
def test_parse_manyfiles_1(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirname = "manyfiles-nflx-gz" timeoutSecs = 600 trial = 0 for iteration in range(ITERATIONS): csvFilename = "file_1.dat.gz" csvPathname = csvDirname + "/" + csvFilename trialStart = time.time() # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA, hex_key=hex_key, delete_on_done=DELETE_ON_DONE, # importParentDir=IMPORT_PARENT_DIR, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False) elapsed = time.time() - start print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numCols, 542) self.assertEqual(numRows, 100000) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" for node in h2o.nodes: h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000) # convert to binomial if DO_EXEC: execExpr="A.hex=%s" % parseResult['destination_key'] 
h2e.exec_expr(execExpr=execExpr, timeoutSecs=20) # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)' # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20) if DO_DELETE_MYSELF: h2o_import.delete_keys_at_all_nodes() print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_500_cols_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 500, 'cA', 1800, 1800), ] h2b.browseTheCloud() for (rowCount, colCount, orig_hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename # create sym links multifile = 1000 # there is already one file. assume it's the "0" case for p in range(1, multifile): csvPathnameLink = csvPathname + "_" + str(p) os.symlink(csvFilename, csvPathnameLink) print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) for trial in range(10): hex_key = orig_hex_key + str(trial) start = time.time() parseResult = h2i.import_parse(path=csvPathname + "*", schema='local', hex_key=hex_key, delete_on_done=1, timeoutSecs=timeoutSecs, doSummary=False) print "Parse:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual( inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount * multifile, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount * multifile))
def test_fp_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if H2O_SUPPORTS_OVER_50K_COLS: tryList = [ (100, 200000, 'cG', 120, 120), (100, 300000, 'cH', 120, 120), (100, 400000, 'cI', 120, 120), (100, 500000, 'cJ', 120, 120), (100, 700000, 'cL', 120, 120), (100, 800000, 'cM', 120, 120), (100, 900000, 'cN', 120, 120), (100, 1000000, 'cO', 120, 120), (100, 1200000, 'cK', 120, 120), ] else: print "Restricting number of columns tested to 50,000" tryList = [ (100, 200000, 'cG', 400, 400), (100, 300000, 'cH', 400, 400), (100, 400000, 'cI', 400, 400), (100, 500000, 'cJ', 400, 400), ] for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) sel = 0 csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) start = time.time() print csvFilename, "parse starting" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) h2o.check_sandbox_for_errors() print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def test_parse_summary_manyfiles_s3_fvec(self): h2o.beta_features = True # these will be used as directory imports/parse csvDirlist = [("manyfiles-nflx-gz", 800)] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: # change to 50 files csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only( bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs ) print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, ) elapsed = time.time() - start print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360) print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_header_rows_mismatch(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL" totalCols = 8 totalRows = 10000 rList = rand_rowData(totalCols) write_syn_dataset(csvPathname, totalRows, headerData, rList) for trial in range(2): # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) key = csvFilename + "_" + str(trial) key2 = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 30 print "Force it to think there's a header. using comma forced as separator" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, header=1, separator=44) print "parseKey['destination_key']: " + parseKey['destination_key'] print 'parse time:', parseKey['response']['time'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? self.assertEqual( inspect['num_cols'], totalCols, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], totalCols)) self.assertEqual( inspect['num_rows'], totalRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % (inspect['num_rows'], totalRows)) kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalRows:", totalRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
def test_cols_enum_multi_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u' ] tryList = [ (300, 100, 'cA', 60, '*x[2-5]*'), (310, 200, 'cB', 60, '*x[1,3-5]*'), (320, 300, 'cC', 60, '*x[1-2,4-5]*'), (330, 400, 'cD', 60, '*x[1-3-5]*'), (340, 500, 'cE', 60, '*x[1-4]*'), ] h2b.browseTheCloud() cnum = 0 # create them all first for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList: cnum += 1 # FIX! should we add a header to them randomly??? print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) for fileN in range(FILENUM): csvFilename = 'syn_' + str(fileN) + "_" + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList) for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList: cnum += 1 # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. h2o.nodes[0].import_files(SYNDATASETS_DIR) # pattern match all, then use exclude parseKey = h2o.nodes[0].parse('*', key2=key2, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs) print "parseKey['destination_key']: " + parseKey['destination_key'] print 'parse time:', parseKey['response']['time'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # FIX! h2o strips one of the headers, but treats all the other files with headers as data num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + parseKey['destination_key'] + ":", \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols) # all should have rowCount rows (due to the excludePattern self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. 
Should be rowCount: %s * FILENUM: %s" % \ (num_rows, rowCount, FILENUM)))
def test_storeview_import(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "standard" csvFilelist = [ ("covtype.data", 300), ] trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: csvPathname = importFolderPath + "/" + csvFilename trialStart = time.time() # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w" ) result = 
h2o.dump_json(storeViewResult) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz doSummary = False parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary) print csvFilenameReplgz, 'parse time:', parseKey['response']['time'] if doSummary: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseKey['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." 
start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # there is an extra response variable if inspect['num_cols'] != (colCount + 1): raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) if inspect['num_rows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount)) # hack it in! for test purposees only parseKey['python_source_key'] = csvFilenameReplgz parseKey['num_rows'] = inspect['num_rows'] parseKey['num_cols'] = inspect['num_cols'] parseKey['value_size_bytes'] = inspect['value_size_bytes'] return parseKey
def test_short(self): csvFilename = 'part-00000b' ### csvFilename = 'short' importFolderPath = '/home/hduser/data' importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, separator=9) print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds" print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_exec2_row_range(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [(1000000, 5, "cA", 200)] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False ) print "Parse:", parseResult["destination_key"], "took", time.time() - start, "seconds" inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format( inspect["numCols"] ) # should match # of cols in header or ?? self.assertEqual( inspect["numCols"], colCount, "parse created result with the wrong number of cols %s %s" % (inspect["numCols"], colCount), ) self.assertEqual( inspect["numRows"], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % (inspect["numRows"], rowCount), ) REPEAT = 1 for i in range(REPEAT): hex_key_i = hex_key + "_" + str(i) execExpr = "%s=%s[1,]" % (hex_key_i, hex_key) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, 100) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, rowCount - 10) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(None, hex_key_i, timeoutSecs=timeoutSecs) h2o_cmd.infoFromInspect(inspect, hex_key_i) print "\n" + hex_key_i, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"])
def test_parse_summary_airline_s3(self): csvFilelist = [ ("allyears2k.csv", 300), #4.4MB ("year1987.csv", 600), #130MB ("allyears.csv", 900), #12GB # ("allyears_10.csv", 1800), #119.98GB ] bucket = 'h2o-airlines-unpacked' (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3') s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import s3" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() # this is schema='local'k parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_rf_hhp_2a_fvec(self): h2o.beta_features = True csvFilenameList = { 'hhp.cut3.214.data.gz', } for csvFilename in csvFilenameList: csvPathname = csvFilename print "RF start on ", csvPathname dataKeyTrain = 'rTrain.hex' start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=dataKeyTrain, schema='put', timeoutSecs=120) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numCols = inspect['numCols'] # we want the last col. Should be values 0 to 14. 14 most rare # from the cut3 set # 84777 0 # 13392 1 # 6546 2 # 5716 3 # 4210 4 # 3168 5 # 2009 6 # 1744 7 # 1287 8 # 1150 9 # 1133 10 # 780 11 # 806 12 # 700 13 # 345 14 # 3488 15 execExpr = "%s[,%s] = %s[,%s]==14" % (dataKeyTrain, numCols, dataKeyTrain, numCols) h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=dataKeyTrain) h2o_cmd.infoFromInspect(inspect, "going into RF") execResult = {'destination_key': dataKeyTrain} kwargs = { 'ntrees': 20, 'max_depth': 20, 'nbins': 50, } rfView = h2o_cmd.runRF(parseResult=execResult, timeoutSecs=900, retryDelaySecs=10, **kwargs) print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds' (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
def test_parse_summary_zip_s3_fvec(self): h2o.beta_features = True csvFilelist = [ ("test_set.zip", 300), # 110.9MB ("train_set.zip", 600), # 362.9MB ] (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3') print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** csvPathname = "allstate/" + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_loop_random_param_covtype(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 5, 'n_folds': 1, 'family': "poisson", 'alpha': 0.0, 'lambda': 0, 'beta_epsilon': 0.001, 'max_iter': 3, 'standardize': 1, 'expert': 1, 'lsm_solver': 'GenGradient', } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds'] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_exec2_constants(self): print "Create some vectors from a constant" print "Don't really need a dataset, but .." for i in range(10): h2e.exec_zero_list(zeroList) inspect = h2o_cmd.runInspect(key='Result9') h2o_cmd.infoFromInspect(inspect, 'Result9') numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numRows, 1000000) self.assertEqual(numCols, 1)
def test_cols_enum_multi_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (300, 100, 'cA', 60, '*x[2-5]*'), (310, 200, 'cB', 60, '*x[1,3-5]*'), (320, 300, 'cC', 60, '*x[1-2,4-5]*'), (330, 400, 'cD', 60, '*x[1-3-5]*'), (340, 500, 'cE', 60, '*x[1-4]*'), ] ## h2b.browseTheCloud() cnum = 0 # create them all first for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # FIX! should we add a header to them randomly??? print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) for fileN in range(FILENUM): csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList) for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: print f h2i.import_only(path=SYNDATASETS_DIR + "/" + f) # pattern match all, then use exclude parseResult = h2i.parse_only(pattern="*/syn_*", hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs) print "parseResult['destination_key']: " + parseResult['destination_key'] print 'parse time:', parseResult['response']['time'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # FIX! 
h2o strips one of the headers, but treats all the other files with headers as data num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + parseResult['destination_key'] + ":", \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols) # all should have rowCount rows (due to the excludePattern self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \ (num_rows, rowCount, FILENUM)))
def test_parse_1m_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (10, 700000, 'cA', 30, 60), # (10, 800000, 'cB', 30, 70), # (10, 900000, 'cC', 30, 80), (10, 1000000, 'cD', 60, 360), # (10, 1100000, 'cE', 60, 100), # (10, 1200000, 'cF', 60, 120), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? self.assertEqual( inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def test_exec2_row_range(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds" inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount)) REPEAT = 1 for i in range(REPEAT): hex_key_i = hex_key + "_" + str(i) execExpr = "%s=%s[1,]" % (hex_key_i, hex_key) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, 100) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, rowCount-10) resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(None, hex_key_i, timeoutSecs=timeoutSecs) h2o_cmd.infoFromInspect(inspect, hex_key_i) print "\n" + hex_key_i, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols'])
def test_parse_65k_cols_01(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10, 63000, 'cH', 100), (10, 65000, 'cH', 100), ] h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) start = time.time() print "Summary should work with 65k" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds" # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount)) # we should obey max_column_display column_limits = [25, 25000] for column_limit in column_limits: inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=column_limit, timeoutSecs=timeoutSecs) self.assertEqual(len( inspect['cols'] ) , column_limit, "inspect obeys max_column_display = " + str(column_limit)) for r in range(0, len( inspect[ 'rows' ] )): # NB: +1 below because each row includes a row header row: #{row} self.assertEqual(len( inspect['rows'][r] ) , column_limit + 1, "inspect data rows obeys max_column_display = " + str(column_limit))
def test_parse_summary_manyfiles_s3n(self): # these will be used as directory imports/parse csvDirlist = [ ("manyfiles-nflx-gz", 600), ] trial = 0 for (csvDirname, timeoutSecs) in csvDirlist: csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trialStart = time.time() # PARSE**************************************** hex_key = csvDirname + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # gives us some reporting on missing values, constant values, to see if we have x specified well # figures out everything from parseResult['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300) # SUMMARY**************************************** summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_GLM2_binomial_goalies(self): h2o.beta_features = True csvPathname = 'poisson/Goalies.csv' print "\nParsing", csvPathname parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="A.hex") inspect = h2o_cmd.runInspect(None, "A.hex") # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) case = 20 execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (6 + 1, 6 + 1, case) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'response': 6, 'n_folds': 1, 'family': "binomial", 'alpha': 0, # seems we always need a little regularization 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds'] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult={'destination_key': 'A.hex'}, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"