def parseit(n, pattern, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    h2i.parse_only(node=h2o.nodes[n], pattern=pattern, hex_key=hex_key,
        timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
        noPoll=PARSE_NOPOLL)
    print pattern, "started in parseit (nopoll)"
    return 'Done'
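# Hypothetical usage sketch (not part of the original suite): fire several
# no-poll parses through parseit() above, spreading the requests across nodes,
# then block until the cluster's job queue drains. Assumes the harness aliases
# already used in these tests (h2o, h2o_jobs) and that PARSE_NOPOLL is True so
# parse_only() returns immediately.
def parse_many_nopoll(patterns, timeoutSecs=60):
    for i, pattern in enumerate(patterns):
        n = i % len(h2o.nodes)              # round-robin the parse requests
        hex_key = "multi_%s.hex" % i        # unique destination key per parse
        parseit(n, pattern, hex_key, timeoutSecs=timeoutSecs)
    # nothing has completed yet with noPoll; wait for all parse jobs here
    h2o_jobs.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=30)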
def test_exec2_fast_locks_fail(self):
    csvPathname = 'iris/iris2.csv'
    src_key = 'iris.csv'
    # need the key name (pattern) to feed to parse
    (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname,
        schema='put', src_key=src_key, timeoutSecs=10)
    # just as a reminder of what these return values look like
    print "importResult:", h2o.dump_json(importResult)
    print "importPattern:", h2o.dump_json(importPattern)
    y = 4
    for trial in range(1, 5):
        # make sure each parse uses a unique dest key (not already in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked,
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(pattern=src_key, delete_on_done=0, hex_key=hex_key, timeoutSecs=10)
        execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, y + 1, hex_key, y + 1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
        # just show the jobs still going, if any. maybe none, because iris is small and parses fast
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
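# What the exec expression above does, in plain python terms (illustration
# only, no cluster): column y+1 of the parsed frame is overwritten with the
# 0/1 result of an element-wise "== 1" comparison.
col = [1, 2, 1, 3]
col = [1 if v == 1 else 0 for v in col]     # like hex_key[,5] = (hex_key[,5] == 1)
print col                                   # -> [1, 0, 1, 0]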
def test_exec2_fast_locks(self):
    csvPathname = 'iris/iris2.csv'
    src_key = 'iris.csv'
    if not AVOID_BUG:
        # need the key name (pattern) to feed to parse
        (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname,
            schema='put', src_key=src_key, timeoutSecs=10)
        # just as a reminder of what these return values look like
        print "importResult:", h2o.dump_json(importResult)
        print "importPattern:", h2o.dump_json(importPattern)
    y = 4
    for trial in range(1, 100):
        if AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname,
                schema='put', src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these return values look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        # make sure each parse uses a unique dest key (not already in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked,
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
            delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
        execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
        # just show the jobs still going, if any. maybe none, because iris is small and parses fast
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
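# Minimal sketch of the AVOID_BUG workaround used above (hypothetical helper,
# not in the suite): with delete_on_done=1 the parsed source key is removed
# after the parse, so it has to be re-PUT before every trial; with
# delete_on_done=0 one import up front is enough.
def import_and_parse(trial, csvPathname, src_key, avoid_bug=True):
    if avoid_bug:
        h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
            src_key=src_key, timeoutSecs=10)
    hex_key = "iris2_" + str(trial) + ".hex"    # unique dest key per trial
    return h2i.parse_only(pattern=src_key, hex_key=hex_key,
        delete_on_done=1 if avoid_bug else 0, timeoutSecs=10)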
def test_parse_airline_multi_hdfs(self): csvFilename = "hex_10" csvFilePattern = '*' # all files in the folder trialMax = 2 for tryHeap in [24]: print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse" h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, disable_assertions=DISABLE_ASSERTIONS, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) timeoutSecs = 3600 importFolderPath = "datasets/airlines_multi" for trial in range(trialMax): hex_key = csvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() importResult = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) print "importResult:", h2o.dump_json(importResult) parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() # we don't delete the hex key. it will start spilling? slow h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_parse_airline_multi_hdfs_many(self): # default csvFilename = "hex_10" csvFilePattern = '*' # all files in the folder for tryHeap in [24]: print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse" h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandboxIgnoreErrors = True timeoutSecs = 500 importFolderPath = "datasets/airlines_multi" csvPathname = importFolderPath + "/" + csvFilePattern parseResult = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) for trial in range(TRIAL_MAX): # each parse now just does one csvFilePattern = "*%s.csv" % trial # if we want multifile # csvFilePattern = "*" hex_key = csvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() # print "Don't wait for completion. Just load things up!" print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?" print "Drat. We can't re-import the folder, if there's a parse using one of the source files?" parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() # we don't delete the hex key. it will start spilling? slow h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30) h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
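# Sketch (assumes the same harness modules these tests import, e.g. h2j and
# h2o_cmd): after a noPoll parse, the destination key is not usable until the
# job queue drains, so any Inspect belongs after pollWaitJobs(), not right
# after parse_only().
def wait_then_inspect(hex_key, timeoutSecs=300):
    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=30)
    inspect = h2o_cmd.runInspect(key=hex_key)
    h2o_cmd.infoFromInspect(inspect, hex_key)
    return inspect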
def test_parse_all_s3n_thru_hdfs(self):
    print "\nLoad a list of files from s3n, parse them thru HDFS"
    print "In EC2, michal's config always passes the right config xml"
    print "as arg to the java -jar h2o.jar. Only works in EC2"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/*'
    importResult = h2i.import_only(bucket=bucket, path=csvPathname, schema='s3n')
    s3nFullList = importResult['succeeded']
    print "s3nFullList:", h2o.dump_json(s3nFullList)
    self.assertGreater(len(s3nFullList), 1, "Didn't see more than 1 file in s3n?")
    s3nList = random.sample(s3nFullList, 8)
    timeoutSecs = 500
    for s in s3nList:
        s3nKey = s['key']
        s3nFilename = s['file']
        # there are some non-file key names returned? s3n metadata?
        # only use the keys with csv in their name
        if ('csv' not in s3nKey) or ('syn_dataset' in s3nKey) or ('.gz' in s3nKey):
            continue
        # creates csvFilename.hex from the file in the hdfs dir
        print "Loading s3n key: ", s3nKey, 'thru HDFS'
        parseResult = h2i.parse_only(pattern=s3nKey, hex_key=s3nFilename + ".hex",
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
        print "parse result:", parseResult['destination_key']
        start = time.time()
        sys.stdout.flush()
def test_parse_cust(self): # run as user 0xcustomer to get access (with .json config and ssh key file specified) importFolderPath = '/mnt/0xcustomer-datasets' pollTimeoutSecs = 120 retryDelaySecs = 30 timeoutSecs = 300 (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*") importFileList = importResult['files'] importFailList = importResult['fails'] importKeyList = importResult['keys'] importDelList = importResult['dels'] if len(importDelList)!=0: raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList)) if len(importFileList)<MINFILES: raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList)) if len(importKeyList)<MINFILES: raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList)) if len(importFailList)!=0: raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList)) # only parse files with .csv or .tsv in their name (no dirs like that?) goodKeyList = [key for key in importKeyList if ('.csv' in key or '.tsv' in key)] trial = 0 # just do 1? for i, importKey in enumerate(random.sample(goodKeyList,3)): print "importKey:", importKey trial +=1 start = time.time() # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like # force header=0..should mean headers get treated as NAs parseResult = h2i.parse_only(pattern=importKey, header=0, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] origKey = parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=origKey) h2o_cmd.infoFromInspect(inspect, origKey) execExpr = 'newKey = '+origKey+'[1,1]' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) newParseKey = {'destination_key': 'newKey'} h2o_cmd.checkKeyDistribution() h2o.nodes[0].remove_key(key=origKey) # a key isn't created for a scalar # h2o.nodes[0].remove_key(key='newKey') self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
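# Hypothetical helper distilled from the checks above: validate an
# import_only() result dict before parsing. The 'files'/'fails'/'keys'/'dels'
# entries are the ones this test already reads out of importResult.
def check_import_result(importResult, minFiles):
    if len(importResult['dels']) != 0:
        raise Exception("import shouldn't have any deletes: %s" %
            h2o.dump_json(importResult['dels']))
    if len(importResult['fails']) != 0:
        raise Exception("import had failures: %s" %
            h2o.dump_json(importResult['fails']))
    if len(importResult['files']) < minFiles or len(importResult['keys']) < minFiles:
        raise Exception("expected at least %s imported files/keys" % minFiles)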
def test_parse_nflx_loop_hdfs_fvec(self): h2o.beta_features = True print "Using the -.gz files from hdfs" # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz # default csvFilename = "hex_10" csvFilePattern = '*' # all files in the folder for tryHeap in [24]: print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) else: h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandboxIgnoreErrors = True timeoutSecs = 500 importFolderPath = "datasets/airlines_multi" csvPathname = importFolderPath + "/" + csvFilePattern parseResult = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) for trial in range(TRIAL_MAX): # each parse now just does one csvFilePattern = "*%s.csv" % trial # if we want multifile # csvFilePattern = "*" hex_key = csvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() # print "Don't wait for completion. Just load things up!" print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?" print "Drat. We can't re-import the folder, if there's a parse using one of the source files?" parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() # we don't delete the hex key. it will start spilling? slow h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30) h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_parse_multi_exclude_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (300, 100, 'cA', 60, '*x[2-5]*'), (310, 200, 'cB', 60, '*x[1,3-5]*'), (320, 300, 'cC', 60, '*x[1-2,4-5]*'), (330, 400, 'cD', 60, '*x[1-3-5]*'), (340, 500, 'cE', 60, '*x[1-4]*'), ] ## h2b.browseTheCloud() cnum = 0 # create them all first for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # FIX! should we add a header to them randomly??? print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) for fileN in range(FILENUM): csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList) for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: print f h2i.import_only(path=SYNDATASETS_DIR + "/" + f) # pattern match all, then use exclude parseResult = h2i.parse_only(pattern="*/syn_*", hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs) print "parseResult['destination_key']: " + parseResult['destination_key'] print 'parse time:', parseResult['response']['time'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # FIX! h2o strips one of the headers, but treats all the other files with headers as data numRows = inspect['numRows'] numCols = inspect['numCols'] print "\n" + parseResult['destination_key'] + ":", \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # all should have rowCount rows (due to the excludePattern self.assertEqual(numRows, rowCount*FILENUM, msg=("got numRows: %s. Should be rowCount: %s * FILENUM: %s" % \ (numRows, rowCount, FILENUM)))
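# Illustrative only (plain python, no cluster needed): how a glob-style
# pattern such as '*x[2-5]*' selects filenames, to show what the exclude=
# argument above is being handed. The server-side matching is H2O's, not
# fnmatch's, so this is just an approximation for eyeballing the patterns.
import fnmatch
names = ['syn_0_1234_300x100.csv', 'syn_0_1234_310x200.csv', 'syn_0_1234_340x500.csv']
print [n for n in names if fnmatch.fnmatch(n, '*x[2-5]*')]
# -> ['syn_0_1234_310x200.csv', 'syn_0_1234_340x500.csv']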
def test_cols_enum_multi_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (300, 100, 'cA', 60, '*x[2-5]*'), (310, 200, 'cB', 60, '*x[1,3-5]*'), (320, 300, 'cC', 60, '*x[1-2,4-5]*'), (330, 400, 'cD', 60, '*x[1-3-5]*'), (340, 500, 'cE', 60, '*x[1-4]*'), ] ## h2b.browseTheCloud() cnum = 0 # create them all first for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # FIX! should we add a header to them randomly??? print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) for fileN in range(FILENUM): csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList) for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList: cnum += 1 # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: print f h2i.import_only(path=SYNDATASETS_DIR + "/" + f) # pattern match all, then use exclude parseResult = h2i.parse_only(pattern="*/syn_*", hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs) print "parseResult['destination_key']: " + parseResult['destination_key'] print 'parse time:', parseResult['response']['time'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # FIX! h2o strips one of the headers, but treats all the other files with headers as data num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + parseResult['destination_key'] + ":", \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols) # all should have rowCount rows (due to the excludePattern self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \ (num_rows, rowCount, FILENUM)))
def test_exec2_fast_locks_overlap(self): csvPathname = "iris/iris2.csv" src_key = "iris.csv" if not AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only( bucket="smalldata", path=csvPathname, schema="put", src_key=src_key, timeoutSecs=10 ) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) y = 4 lastHexKey = None for trial in range(1, 100): if AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only( bucket="smalldata", path=csvPathname, schema="put", src_key=src_key, timeoutSecs=10 ) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) # make sure each parse is unique dest key (not in use) hex_key = "iris2_" + str(trial) + ".hex" # what if we kicked off another parse without waiting for it? I think the src key gets locked # so we'd get lock issues on the src_key parseResult = h2i.parse_only( pattern=src_key, hex_key=hex_key, noPoll=True, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10 ) # wait until iteration 2, when lastHexKey is available, so you can operate on that if lastHexKey: execExpr = "%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y + 1, lastHexKey, y + 1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) lastHexKey = hex_key # since we are using the same source file, and potentially re-uploading if AVOID_BUG # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to # use it next iteration h2o_jobs.pollWaitJobs(timeoutSecs=10) # just show the jobs still going. Shouldn't be any a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_parse_airline_multi_hdfs(self): h2o.beta_features = True csvFilename = "hex_10" csvFilePattern = '*' # all files in the folder trialMax = 2 for tryHeap in [24]: print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930, disable_assertions=DISABLE_ASSERTIONS, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) else: # why is 55609 already in use?? h2o_hosts.build_cloud_with_hosts(sandbox_ignore_errors=True, force_tcp=True, java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55604, disable_assertions=DISABLE_ASSERTIONS, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandboxIgnoreErrors = True timeoutSecs = 3600 importFolderPath = "datasets/airlines_multi" for trial in range(trialMax): hex_key = csvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() importResult = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) print "importResult:", h2o.dump_json(importResult) parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() # we don't delete the hex key. it will start spilling? slow h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_exec2_fast_locks_overlap(self): csvPathname = 'iris/iris2.csv' src_key='iris.csv' if not AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) y = 4 lastHexKey = None for trial in range (1, 100): if AVOID_BUG: # need the key name (pattern) to feed to parse) (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', src_key=src_key, timeoutSecs=10) # just as a reminder of what these returns look like print "importResult:", h2o.dump_json(importResult) print "importPattern:", h2o.dump_json(importPattern) # make sure each parse is unique dest key (not in use) hex_key = "iris2_" + str(trial) + ".hex" # what if we kicked off another parse without waiting for it? I think the src key gets locked # so we'd get lock issues on the src_key parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10) # wait until iteration 2, when lastHexKey is available, so you can operate on that if lastHexKey: execExpr="%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=10) lastHexKey = hex_key # since we are using the same source file, and potentially re-uploading if AVOID_BUG # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to # use it next iteration h2o_jobs.pollWaitJobs(timeoutSecs=10) # just show the jobs still going. Shouldn't be any a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
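# Illustration (plain python, hypothetical): the overlap above is a one-stage
# pipeline over destination keys; each iteration execs against the key parsed
# in the previous iteration while the current parse is in flight. This shows
# the (previous, current) pairing with no cluster calls.
def overlapped_pairs(n):
    last = None
    for trial in range(1, n + 1):
        cur = "iris2_%s.hex" % trial
        yield last, cur         # exec runs on 'last' while 'cur' is parsing
        last = cur

print list(overlapped_pairs(3))
# -> [(None, 'iris2_1.hex'), ('iris2_1.hex', 'iris2_2.hex'), ('iris2_2.hex', 'iris2_3.hex')]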
def test_parse_all_s3n_thru_hdfs(self): print "\nLoad a list of files from s3n, parse it thru HDFS" print "In EC2, michal's config always passes the right config xml" print "as arg to the java -jar h2o.jar. Only works in EC2" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/*' importResult = h2i.import_only(bucket=bucket, path=csvPathname, schema='s3n') s3nFullList = importResult['succeeded'] print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList), 1, "Didn't see more than 1 files in s3n?") s3nList = random.sample(s3nFullList, 8) timeoutSecs = 500 for s in s3nList: s3nKey = s['key'] s3nFilename = s['file'] # there is some non-file key names returned? s3n metadata? # only use the keys with csv in their name if ('csv' not in s3nKey) or ('syn_dataset' in s3nKey) or ('.gz' in s3nKey): continue # creates csvFilename.hex from file in hdfs dir print "Loading s3n key: ", s3nKey, 'thru HDFS' parseResult = h2i.parse_only(pattern=s3nKey, hex_key=s3nFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) print s3nFilename, 'parse time:', parseResult['response']['time'] print "parse result:", parseResult['destination_key'] start = time.time() sys.stdout.flush()
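# Hypothetical distilled filter for the s3n key selection above: keep only
# plain .csv keys, skipping gzipped files and the synthetic 'syn_dataset'
# keys, before sampling which ones to parse.
def usable_s3n_keys(s3nFullList):
    return [s for s in s3nFullList
        if ('csv' in s['key']) and ('syn_dataset' not in s['key']) and ('.gz' not in s['key'])]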
def test_parse_multi_header_single_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output" # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ (57, 300, 9, 'cA', 60, 0), # try with 1-3 data lines in the header file too (57, 300, 9, 'cB', 60, 1), (57, 300, 9, 'cC', 60, 2), (57, 300, 9, 'cD', 60, 3), ] trial = 0 for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList: trial += 1 # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 for fileN in range(fileNum): csvFilename = 'syn_' + str(fileN) + "_" + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount) dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList) totalDataRows += dataRowsDone # create the header file # can make it pass by not doing this if HEADER: csvFilename = 'syn_header_' + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList) totalDataRows += dataRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = "syn_" + str(trial) hex_key = "syn_" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) print f if HEADER: header = h2i.find_key('syn_header') if not header: raise Exception( "Didn't find syn_header* key in the import") # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key = " + header start = time.time() parseResult = h2i.parse_only(pattern='*' + rowxcol + '*', hex_key=hex_key, timeoutSecs=timeoutSecs, header="1", header_from_file=header) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual( inspect['numCols'], totalCols, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly if HEADER: kwargs = { 'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE' } else: kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1} start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
def test_exec_enums_rand_cut2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1, MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1 == 1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0, 1) == 0: cutExprList.append('p$C' + str(i + 1) + '!=' + c) else: cutExprList.append('p$C' + str(i + 1) + '==' + c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J' + src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*' + src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) 
levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception( "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1 == 1: a = 'a=c(1,2,3);' + ';'.join( ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C' + str(iColCount + 1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print( hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
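# Offline sanity check (plain python, hypothetical; not part of the test):
# the 0.5 / 0.999 thresholds passed to quantiles() above correspond to this
# simple sorted-list quantile, handy for eyeballing h2o's result on small data.
def simple_quantile(values, threshold):
    v = sorted(values)
    idx = int(threshold * (len(v) - 1))     # nearest-rank style index
    return v[idx]

print simple_quantile(range(1000), 0.5)     # -> 499
print simple_quantile(range(1000), 0.999)   # -> 998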
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1,64) # random length headers headerName = ''.join([random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0,1) GZIP_HEADER = random.randint(0,1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k,v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN==" " or SEP_CHAR_GEN==",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0,1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0,1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathnameh # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. 
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) h2o_cmd.runStoreView() headerKey = h2i.find_key(hdrFilename) dataKey = h2i.find_key(csvFilename) # use regex. the only files in the dir will be the ones we just created # with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'header': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'data': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey): ignoreForRf = hfhList[0] elif DATA_HAS_HDR_ROW: ignoreForRf = hfdList[0] else: ignoreForRf = None print "If header_from_file= , required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*' else: pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*' # don't pass to parse kwargs.pop('hdr_separator', None) parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, # causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) h2oGainsOneData = False print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 if 1==0: # FIX! 
don't check for now self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf} start = time.time() # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
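# Sketch (hypothetical helper, not in the suite) of the header bookkeeping the
# test above does before parse: header_from_file requires header=1; a header
# row in the data files alone still needs header=1; otherwise header stays 0.
def header_kwargs(headerKey, header_file_has_hdr_row, data_has_hdr_row):
    kwargs = {}
    if header_file_has_hdr_row:
        kwargs['header_from_file'] = headerKey
        kwargs['header'] = 1
    elif data_has_hdr_row:
        kwargs['header'] = 1
    else:
        kwargs['header'] = 0
    return kwargs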
def test_parse_multi_header_rand(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(20): l = random.randint(1, 64) # random length headers headerName = ''.join( [random.choice(allowedLetters) for _ in range(l)]) headerChoices.append(headerName) # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ # FIX! one fails count for now # (1, 5, 9, 'cA', 60, 0), (1, 5, 9, 'cA', 60, 0), (1, 5, 25, 'cA', 60, 0), # try with col mismatch on header. # FIX! causes exception? don't test for now # (7, 300, 10, 'cA', 60, 0), # (7, 300, 10, 'cB', 60, 1), # (7, 300, 10, 'cC', 60, 2), # (7, 300, 10, 'cD', 60, 3), # (7, 300, 8, 'cA', 60, 0), # (7, 300, 8, 'cB', 60, 1), # (7, 300, 8, 'cC', 60, 2), # (7, 300, 8, 'cD', 60, 3), ] # so many random combos..rather than walk tryList, just do random for some amount of time for trial in range(50): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 # DATA_HAS_HDR_ROW = random.randint(0,1) DATA_HAS_HDR_ROW = 0 # PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = 0 ## DATA_FIRST_IS_COMMENT = random.randint(0,1) ## HEADER_FIRST_IS_COMMENT = random.randint(0,1) print "TEMPORARY: don't put any comments in" DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 # none is not legal # SEP_CHAR_GEN = random.choice(paramsDict['separator']) SEP_CHAR_GEN = "\t" print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', SEP_CHAR_GEN # they need to both use the same separator (h2o rule) hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] print hh print "UPDATE: always use comma (space legal also?) for header separator?? it should work no matter what separator the data uses?" headerForHeader = ",".join(hh) # make these different hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] headerForData = SEP_CHAR_GEN.join(hh) # random selection of parse param choices kwargs = {} for k, v in paramsDict.items(): aChoice = random.choice(v) # can tell h2o something different compared to what we actually used! if k == 'separator': if aChoice: sepChar = aChoice sepCharInt = ord(aChoice) # make it an integer for h2o else: sepChar = ',' # default char for None, need it for header/data file creation sepCharInt = None aChoice = sepCharInt kwargs[k] = aChoice # FOR NOW: ..override the rand choice if it exists, so we can parse and expect 'A' to be found # match what was gen'ed if choice is not None if kwargs['separator']: if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",": # parse doesn't auto-detect tab. 
will autodetect space and comma del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # create data files for fileN in range(fileNum): csvFilename = 'syn_data_' + str(fileN) + "_" + str( SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset( csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # create the header file hdrFilename = 'syn_header_' + str(SEED) + "_" + str( trial) + "_" + rowxcol + '.csv' hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset( hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) if PARSE_PATTERN_INCLUDES_HEADER: # only include header file data rows if the parse pattern includes it totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w xs = h2o.nodes[0].import_files(SYNDATASETS_DIR)['keys'] headerKey = [x for x in xs if hdrFilename in x][0] dataKey = [x for x in xs if csvFilename not in x][0] # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'syn_header': kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'syn_data': kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None print "If header_from_file= is used, we are currently required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = '*syn_*' + str(trial) + "_" + rowxcol + '*' else: pattern = '*syn_data_*' + str(trial) + "_" + rowxcol + '*' parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? 
self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData ### print (headerRowsDone!=0), (kwargs['header']==1), DATA_HAS_HDR_ROW, (kwargs['header_from_file'] is not None) if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows (header rows don't count) h2o: %s gen'ed: %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it h2oShouldSeeHeader = (HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)) or DATA_HAS_HDR_ROW if h2oShouldSeeHeader: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1, 'ignore': 'A'} else: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
def test_parse_airline_multi_hdfs(self): h2o.beta_features = True csvFilename = "hex_10" csvFilePattern = '*' # all files in the folder trialMax = 2 for tryHeap in [24]: print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930, disable_assertions=DISABLE_ASSERTIONS, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) else: # why is 55609 already in use?? h2o_hosts.build_cloud_with_hosts( sandbox_ignore_errors=True, force_tcp=True, java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55604, disable_assertions=DISABLE_ASSERTIONS, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION) # don't raise exception if we find something bad in h2o stdout/stderr? # h2o.nodes[0].sandboxIgnoreErrors = True timeoutSecs = 3600 importFolderPath = "datasets/airlines_multi" for trial in range(trialMax): hex_key = csvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() importResult = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) print "importResult:", h2o.dump_json(importResult) parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_cmd.runStoreView() # we don't delete the hex key. it will start spilling? slow h2o.tear_down_cloud() # sticky ports? wait a bit. time.sleep(5)
def test_parse_multi_header_single_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output" # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ (57, 300, 9, 'cA', 60, 0), # try with 1-3 data lines in the header file too (57, 300, 9, 'cB', 60, 1), (57, 300, 9, 'cC', 60, 2), (57, 300, 9, 'cD', 60, 3), ] trial = 0 for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList: trial += 1 # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 for fileN in range(fileNum): csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount) dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList) totalDataRows += dataRowsDone # create the header file # can make it pass by not doing this if HEADER: csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList) totalDataRows += dataRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = "syn_" + str(trial) hex_key = "syn_" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) print f if HEADER: header = h2i.find_key('syn_header') if not header: raise Exception("Didn't find syn_header* key in the import") # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key = " + header start = time.time() parseResult = h2i.parse_only(pattern='*'+rowxcol+'*', hex_key=hex_key, timeoutSecs=timeoutSecs, header="1", header_from_file=header) print "parseResult['destination_key']: " + parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual(inspect['numCols'], totalCols, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly if HEADER: kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE'} else: kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1} start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
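The test above leans on rand_rowData and write_syn_dataset helpers defined elsewhere in the real test file. A minimal sketch of the contract it appears to assume (random integer rows, an optional header line, return the count of data rows written) is below; the behavior is inferred for illustration only and is not the actual helper code.

import random

def rand_rowData(colCount):
    # colCount inputs plus one extra output column of small random ints
    return [random.randint(0, 7) for _ in range(colCount + 1)]

def write_syn_dataset(csvPathname, rowCount, headerData=None, rList=None):
    # write an optional header line, then rowCount comma-separated data rows;
    # only data rows are counted in the return value
    dataRowsDone = 0
    with open(csvPathname, 'w') as f:
        if headerData is not None:
            f.write(headerData + "\n")
        for _ in range(rowCount):
            row = rList if rList is not None else rand_rowData(9)
            f.write(",".join(map(str, row)) + "\n")
            dataRowsDone += 1
    return dataRowsDone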
def test_parse_multi_header_rand(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(20): l = random.randint(1,64) # random length headers headerName = ''.join([random.choice(allowedLetters) for _ in range(l)]) headerChoices.append(headerName) # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ # FIX! one fails count for now # (1, 5, 9, 'cA', 60, 0), (1, 5, 9, 'cA', 60, 0), (1, 5, 25, 'cA', 60, 0), # try with col mismatch on header. # FIX! causes exception? don't test for now # (7, 300, 10, 'cA', 60, 0), # (7, 300, 10, 'cB', 60, 1), # (7, 300, 10, 'cC', 60, 2), # (7, 300, 10, 'cD', 60, 3), # (7, 300, 8, 'cA', 60, 0), # (7, 300, 8, 'cB', 60, 1), # (7, 300, 8, 'cC', 60, 2), # (7, 300, 8, 'cD', 60, 3), ] # so many random combos..rather than walk tryList, just do random for some amount of time for trial in range(50): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 # DATA_HAS_HDR_ROW = random.randint(0,1) DATA_HAS_HDR_ROW = 0 # PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = 0 ## DATA_FIRST_IS_COMMENT = random.randint(0,1) ## HEADER_FIRST_IS_COMMENT = random.randint(0,1) print "TEMPORARY: don't put any comments in" DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 # none is not legal # SEP_CHAR_GEN = random.choice(paramsDict['separator']) SEP_CHAR_GEN = "\t" print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', SEP_CHAR_GEN # they need to both use the same separator (h2o rule) hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] print hh print "UPDATE: always use comma (space legal also?) for header separator?? it should work no matter what separator the data uses?" headerForHeader = ",".join(hh) # make these different hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] headerForData = SEP_CHAR_GEN.join(hh) # random selection of parse param choices kwargs = {} for k,v in paramsDict.items(): aChoice = random.choice(v) # can tell h2o something different compared to what we actually used! if k == 'separator': if aChoice: sepChar = aChoice sepCharInt = ord(aChoice) # make it an integer for h2o else: sepChar = ',' # default char for None, need it for header/data file creation sepCharInt = None aChoice = sepCharInt kwargs[k] = aChoice # FOR NOW: ..override the rand choice if it exists, so we can parse and expect 'A' to be found # match what was gen'ed if choice is not None if kwargs['separator']: if SEP_CHAR_GEN==" " or SEP_CHAR_GEN==",": # parse doesn't auto-detect tab. 
will autodetect space and comma del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # create data files for fileN in range(fileNum): csvFilename = 'syn_data_' + str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # create the header file hdrFilename = 'syn_header_' + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) if PARSE_PATTERN_INCLUDES_HEADER: # only include header file data rows if the parse pattern includes it totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w xs = h2o.nodes[0].import_files(SYNDATASETS_DIR)['keys'] headerKey = [x for x in xs if hdrFilename in x][0] dataKey = [x for x in xs if csvFilename not in x][0] # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'syn_header': kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'syn_data': kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None print "If header_from_file= is used, we are currently required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = '*syn_*'+str(trial)+"_"+rowxcol+'*' else: pattern = '*syn_data_*'+str(trial)+"_"+rowxcol+'*' parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult['destination_key'] print 'parse time:', parseResult['response']['time'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # more reporting: (we can error here if extra col in header, causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? 
self.assertEqual(inspect['num_cols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData ### print (headerRowsDone!=0), (kwargs['header']==1), DATA_HAS_HDR_ROW, (kwargs['header_from_file'] is not None) if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 self.assertEqual(inspect['num_rows'], totalDataRows, "parse created result with the wrong number of rows (header rows don't count) h2o: %s gen'ed: %s" % \ (inspect['num_rows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it h2oShouldSeeHeader = (HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)) or DATA_HAS_HDR_ROW if h2oShouldSeeHeader: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1, 'ignore': 'A'} else: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
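The separator handling in the test above (drop the parameter when the generated separator is comma or space, since parse auto-detects those, otherwise pass the character as an integer code) is the part most easily gotten wrong. A small hedged restatement of that rule, with a hypothetical helper name:

def separator_kwargs(sep_char):
    # parse auto-detects comma and space, so pass nothing for those;
    # anything else (e.g. tab) has to be passed as an integer char code
    if sep_char in (',', ' '):
        return {}
    return {'separator': ord(sep_char)}   # '\t' -> 9

# usage, mirroring the loop above:
# kwargs.update(separator_kwargs(SEP_CHAR_GEN))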
def test_parse_multi_header_single(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output" # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ (57, 300, 9, 'cA', 60, 0), # try with 1-3 data lines in the header file too (57, 300, 9, 'cB', 60, 1), (57, 300, 9, 'cC', 60, 2), (57, 300, 9, 'cD', 60, 3), ] trial = 0 for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList: trial += 1 # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 for fileN in range(fileNum): csvFilename = 'syn_' + str(fileN) + "_" + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount) dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList) totalDataRows += dataRowsDone # create the header file # can make it pass by not doing this if HEADER: csvFilename = 'syn_header_' + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList) totalDataRows += dataRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = "syn_" + str(trial) hex_key = "syn_" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead? # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) print f # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node? # frames = h2o.nodes[0].frames()['frames'] frames = h2o.n0.frames()['frames'] frames_dict = h2o_util.list_to_dict(frames, 'key/name') # print "frames:", dump_json(frames) # print "frames_dict:", dump_json(frames_dict) if HEADER: header = h2i.find_key('syn_header') if not header: raise Exception( "Didn't find syn_header* key in the import") # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key = " + header start = time.time() # does h2o-dev take a regex? 
# or do we need to glob?
parseResult = h2i.parse_only( pattern='*' + rowxcol + '*', hex_key=hex_key, timeoutSecs=timeoutSecs, checkHeader="1") # header_from_file=header pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols) print pA.numRows print pA.numCols print pA.parse_key expectedLabelList = headerData.split(",") iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols, expectedMissinglist=[], expectedLabelList=expectedLabelList) if DO_RF: # put in an ignore param, that will fail unless headers were parsed correctly if HEADER: kwargs = { 'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE' } else: kwargs = { 'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1 } rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o.check_sandbox_for_errors()
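The ParseObj/InspectObj expectations above reduce to simple bookkeeping over the generated files. A hedged restatement with illustrative names only (not part of the test file):

def expected_counts(fileNum, rowCount, colCount, dataRowsWithHeader, useHeaderFile):
    # every data file contributes rowCount rows; the header file contributes
    # its extra data rows, but the header row itself is never counted
    totalCols = colCount + 1              # +1 for the output column
    totalDataRows = fileNum * rowCount
    if useHeaderFile:
        totalDataRows += dataRowsWithHeader
    return totalDataRows, totalCols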
def test_exec_enums_rand_cut2(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1,MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0,1)==0: cutExprList.append('p$C'+str(i+1)+'!='+c) else: cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount+1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J'+src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*'+src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", 
h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C'+str(iColCount+1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print(hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount+1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
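For reference, the cut expressions fed to exec above are assembled roughly as follows: pick a random subset of input columns, pick a random enum code for each, compare with '==' or '!=' against references like p$C1, and join the terms with '&'. This sketch mirrors that loop and is illustrative rather than the test's own helper.

import random

def random_cut_expr(hex_key, iColCount, colEnumList):
    # compare a random subset of input columns against random enum codes
    cols = random.sample(range(iColCount), random.randint(1, iColCount))
    terms = []
    for c in cols:
        code = str(random.choice(range(len(colEnumList[c]))))
        op = '==' if random.randint(0, 1) else '!='
        terms.append('%s$C%s%s%s' % (hex_key, c + 1, op, code))
    # keep only the first output column of the rows that match the cut
    return '%s[%s,%s];' % (hex_key, ' & '.join(terms), iColCount + 1)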
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1, 64) # random length headers headerName = ''.join( [random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0, 1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0, 1) GZIP_HEADER = random.randint(0, 1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k, v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0, 1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0, 1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str( trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset( csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename( csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str( trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset( hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename( hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathnameh # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. 
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) h2o_cmd.runStoreView() headerKey = h2i.find_key(hdrFilename) dataKey = h2i.find_key(csvFilename) # use regex. the only files in the dir will be the ones we just created # with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'header': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'data': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey): ignoreForRf = hfhList[0] elif DATA_HAS_HDR_ROW: ignoreForRf = hfdList[0] else: ignoreForRf = None print "If header_from_file= , required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*' else: pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*' # don't pass to parse kwargs.pop('hdr_separator', None) parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, # causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) h2oGainsOneData = False print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 if 1 == 0: # FIX! 
# don't check for now
self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it kwargs = { 'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf } start = time.time() # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
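The GZIP_DATA / GZIP_HEADER branches above call h2o_util.file_gzip to compress the generated csv next to the original. For readers without that module at hand, a standard-library equivalent of what the call needs to accomplish is sketched below; the real helper may differ in details.

import gzip
import shutil

def file_gzip(src_path, dst_gz_path):
    # stream-copy the plain file into a gzip file alongside it
    with open(src_path, 'rb') as src:
        with gzip.open(dst_gz_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)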