def test_exec2_col_scalar(self): # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 maxTrials = 20 csvFilenameAll = [ ("covtype.data", 15), ("covtype20x.data", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" # just always use the same hex_key, so the zeroList is right all the time hex_key = 'cA' for (csvFilename, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec2_dkv(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilenameAll = [ ("syn_10x8.csv", 'cA', 15), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 10x8 csv" write_syn_dataset(csvPathname, 10, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=6, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_factor_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() # use SEED so the file isn't cached? csvFilenameAll = [ ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 1mx8 csv" write_syn_dataset(csvPathname, 1000000, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_exec2_na_chop(self): bucket = 'home-0xdiag-datasets' csvPathname = 'airlines/year2013.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) inspect = h2o_cmd.runInspect(key='i.hex') print "\nr.hex" \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows1 = inspect['numRows'] numCols = inspect['numCols'] for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, keyX='s.hex', maxTrials=200, timeoutSecs=30, maxCol=numCols-1) inspect = h2o_cmd.runInspect(key='s.hex') print "\ns.hex" \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows2 = inspect['numRows'] print numRows1, numRows2 h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec2_operators2(self): bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) # now run them just concatenating each time. We don't do any template substitutes, so don't need # exec_expr_list_rand() bigExecExpr = "" expCnt = 0 for execExpr in exprList: bigExecExpr += execExpr + ";" h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4) expCnt += 1 # limit to 5 expressions and see what happens if expCnt > 2: bigExecExpr = "" expCnt = 0 h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec_import_hosts(self): # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype.data", "cB", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: # import each time, because h2o deletes source file after parse # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['Key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_exec_operators(self): if 1 == 1: for execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=4) else: # init with put_value for i in range(0, 5): key = "ResultUnparsed" + str(i) put = h2o.nodes[0].put_value(i, key=key, repl=None) # have to parse the key after you put_value it. put_value should parse the result first! key2 = "Result" + str(i) parse = h2o.nodes[0].parse(put['key'], key2, timeoutSecs=10) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "operators", 'took', time.time( ) - start, 'seconds'
def test_slice(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_exec2_operators4(self): bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=10) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) # now run them just concatenating each time. We don't do any template substitutes, so don't need # exec_expr_list_rand() bigExecExpr = "" expCnt = 0 for t in range(200): execExpr = random.choice(exprList) bigExecExpr += execExpr + ";" h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=10) expCnt += 1 # limit to 2 expressions. # Also: functions must be solitary # Also: ifelse() must be solitary # Also: ternary operators must be solitary if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr: bigExecExpr = "" expCnt = 0 h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec2_unary(self): bucket = 'home-0xdiag-datasets' csvPathname = 'airlines/year2013.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) start = time.time() # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10) h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=30, allowEmptyResult=True, nanOkay=True) h2o.check_sandbox_for_errors() print "exec end on ", "operators", 'took', time.time( ) - start, 'seconds'
def test_exec2_col_scalar(self): h2o.beta_features = True # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [("covtype.data", "cA", 15)] else: maxTrials = 20 csvFilenameAll = [("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60)] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, timeoutSecs=2000 ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand( lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs )
def test_exec2_operators4(self): bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) # now run them just concatenating each time. We don't do any template substitutes, so don't need # exec_expr_list_rand() bigExecExpr = "" expCnt = 0 for t in range(200): execExpr = random.choice(exprList) bigExecExpr += execExpr + ";" h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4) expCnt += 1 # limit to 2 expressions. # Also: functions must be solitary # Also: ifelse() must be solitary # Also: ternary operators must be solitary if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr: bigExecExpr = "" expCnt = 0 h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_slice(self): importFolderPath = "standard" csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['desination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, hex_key, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_dkv(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilenameAll = [ ("syn_10x8.csv", 'cA', 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 10x8 csv" write_syn_dataset(csvPathname, 10, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=6, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_loop_random_exec_covtype(self): csvPathname = 'UCI/UCI-large/covtype/covtype.data' parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=15) print "\nParse key is:", parseResult['destination_key'] # h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_loop_random_exec_covtype(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=15) print "\nParse key is:", parseResult['destination_key'] # h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_loop_random_exec_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15) print "\nParse key is:", parseKey['destination_key'] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=5) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (csvFilename, key2, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_exec2_operators2(self): bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r0.hex', maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec2_operators(self): bucket = "smalldata" csvPathname = "iris/iris2.csv" hexKey = "i.hex" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, "r0.hex", maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "operators", "took", time.time() - start, "seconds"
def test_vector_filter_factor(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype20x.data", "cC", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # have to import each time, because h2o deletes the source file after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec2_covtype_rand1(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=15) print "\nParse key is:", parseResult['destination_key'] ### h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_exec2_operators(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' csvPathname = 'airlines/year2013.csv' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec_operators(self): h2o.beta_features = True for i, execExpr in enumerate(initList): if h2o.beta_features: # no default result resultKey = "Result" + str(i) else: resultKey = "Result.hex" h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_exec2_unary(self): h2o.beta_features = True bucket = "home-0xdiag-datasets" csvPathname = "airlines/year2013.csv" hexKey = "i.hex" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) start = time.time() # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10) h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=30) h2o.check_sandbox_for_errors() print "exec end on ", "operators", "took", time.time() - start, "seconds"
def test_loop_random_exec_covtype(self): csvPathname = "UCI/UCI-large/covtype/covtype.data" parseResult = h2i.import_parse( bucket="datasets", path=csvPathname, schema="put", hex_key="c.hex", timeoutSecs=15 ) print "\nParse key is:", parseResult["destination_key"] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand( len(h2o.nodes), exprList, "c.hex", maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=10 ) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data", "took", time.time() - start, "seconds"
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec_2(self): # exec2 doesn't autoframe? fvec everything h2o.beta_features = True # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA.hex", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA.hex", 15), ("covtype20x.data", "cA.hex", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['desination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec_operators(self): if 1==1: for execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=4) else: # init with put_value for i in range(0,5): key = "ResultUnparsed" + str(i) put = h2o.nodes[0].put_value(i, key=key, repl=None) # have to parse the key after you put_value it. put_value should parse the result first! hex_key = "Result" + str(i) parse = h2o.nodes[0].parse(put['key'], hex_key, timeoutSecs=10) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_slice(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (csvFilename, key2, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_loop_random_exec_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15) print "\nParse key is:", parseKey['destination_key'] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data", 'took', time.time( ) - start, 'seconds'
def test_exec2_operators2(self): bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'i.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) # now run them just concatenating each time. We don't do any template substitutes, so don't need # exec_expr_list_rand() for execExpr in exprList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4) h2o.check_sandbox_for_errors() print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_vector_filter_factor(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype20x.data", "cC", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # have to import each time, because h2o deletes the source file after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec2_operators2(self):
    """Exercise the exec operators against iris2.csv parsed as 'i.hex'.

    After the initList setup and a pass through exec_expr_list_rand, the
    expressions in exprList are sent again as growing ';'-terminated
    batches of one, two, then three expressions before the batch restarts.
    """
    parsed = h2i.import_parse(
        bucket='smalldata',
        path='iris/iris2.csv',
        schema='put',
        hex_key='i.hex',
    )

    for init_key, init_expr in initList:
        h2e.exec_expr(h2o.nodes[0], init_expr, resultKey=init_key, timeoutSecs=4)

    start = time.time()
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None,
                            maxTrials=200, timeoutSecs=10)

    # Concatenating pass: no template substitution is involved, so plain
    # exec_expr is enough and exec_expr_list_rand is not needed.
    batch = []
    for expr in exprList:
        batch.append(expr + ";")
        h2e.exec_expr(h2o.nodes[0], "".join(batch), resultKey=None, timeoutSecs=4)
        # Start over once three expressions have been sent together.
        if len(batch) > 2:
            batch = []

    h2o.check_sandbox_for_errors()
    elapsed = time.time() - start
    # Single-argument print keeps the output identical under Python 2.
    print("%s %s %s %s %s" % ("exec end on ", "operators", 'took', elapsed, 'seconds'))
def test_exec2_operators2(self):
    """Exercise the exec operators against iris2.csv parsed as 'i.hex'.

    Steps: parse the dataset, run every (resultKey, expression) pair from
    initList, run exprList through exec_expr_list_rand (empty results
    allowed), then run every entry of exprBigList in order, and finally
    check the sandbox for errors.
    """
    bucket = "smalldata"
    csvPathname = "iris/iris2.csv"
    hexKey = "i.hex"
    # Return value is not needed; later expressions refer to the data by key.
    h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

    for resultKey, execExpr in initList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

    # The reported time covers the random pass and the exprBigList pass only.
    start = time.time()
    h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None,
                            maxTrials=200, timeoutSecs=10, allowEmptyResult=True)

    # Each exprBigList entry is sent as-is; nothing is concatenated here.
    # NOTE(review): presumably the entries are already multi-statement
    # expressions -- confirm against the exprBigList definition.
    # No template substitution is involved, so exec_expr_list_rand is not needed.
    for execExpr in exprBigList:
        h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4.0)

    h2o.check_sandbox_for_errors()
    elapsed = time.time() - start
    # Single-argument print keeps the output identical under Python 2.
    print("%s %s %s %s %s" % ("exec end on ", "operators", "took", elapsed, "seconds"))
def test_exec_import_hosts(self):
    """Parse covtype datasets from an import folder and run random exec expressions.

    The import folder is set up once, before the loop. The dataset list
    depends on the module-level `localhost` flag. Each dataset tuple is
    (file name, destination key, exec timeout in seconds).
    """
    # An HDFS-backed folder ("/home/hduser/hdfs_datasets") was used here before.
    import_dir = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, import_dir)

    # The timeout is per dataset: it can be 10 secs for covtype 20x
    # (col key creation), so probably 10x that for covtype200.
    datasets = [
        ("covtype.data", "cA", 5),
        ("covtype.data", "cB", 5),
    ]
    if not localhost:
        datasets = datasets + [
            ("covtype20x.data", "cC", 50),
            ("covtype20x.data", "cD", 50),
        ]

    h2b.browseTheCloud()
    node_count = len(h2o.nodes)

    for csv_name, dest_key, exec_timeout in datasets:
        # Not read below, but the draw is kept so the random module's state
        # advances exactly as before.
        seed_per_file = random.randint(0, sys.maxint)

        # Creates csv_name.hex from the file in the import folder.
        parse_key = h2i.parseImportFolderFile(None, csv_name, import_dir,
                                              key2=dest_key, timeoutSecs=2000)
        # Single-argument print keeps the output identical under Python 2.
        print("%s %s %s" % (csv_name, 'parse time:', parse_key['response']['time']))
        print("%s %s" % ("Parse result['destination_key']:", parse_key['destination_key']))
        inspect_result = h2o_cmd.runInspect(None, parse_key['destination_key'])

        print("\n" + csv_name)
        h2e.exec_zero_list(zeroList)
        h2e.exec_expr_list_rand(node_count, exprList, dest_key,
                                maxCol=54, maxRow=400000,
                                maxTrials=200, timeoutSecs=exec_timeout)
def test_exec_import_hosts(self):
    """Parse covtype datasets from the 'home-0xdiag-datasets' bucket and run random exec expressions.

    The dataset list and the trial count depend on the module-level
    `localhost` flag. Each dataset tuple is (file name, destination hex
    key, exec timeout in seconds).
    """
    # The timeout is per dataset: it can be 10 secs for covtype 20x
    # (col key creation), so probably 10x that for covtype200.
    if localhost:
        maxTrials = 200
        csvFilenameAll = [
            ("covtype.data", "cA", 15),
        ]
    else:
        maxTrials = 20
        csvFilenameAll = [
            ("covtype.data", "cB", 15),
            ("covtype20x.data", "cD", 60),
        ]

    csvFilenameList = csvFilenameAll
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    importFolderPath = "standard"

    for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
        # Creates the hex key from the file under the import folder path.
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                                       hex_key=hex_key, timeoutSecs=2000)
        # Single-argument print keeps the output identical under Python 2.
        print("%s %s %s" % (csvFilename, 'parse time:', parseResult['response']['time']))
        print("%s %s" % ("Parse result['destination_key']:", parseResult['destination_key']))

        # We should be able to see the parse result; the call is made for
        # that check only and its return value is not used.
        h2o_cmd.runInspect(None, parseResult['destination_key'])

        print("\n" + csvFilename)
        h2e.exec_zero_list(zeroList)
        # The expressions use colX+1, so keep maxCol at 53.
        # This test uses factor, so the per-dataset timeouts are larger.
        h2e.exec_expr_list_rand(lenNodes, exprList, hex_key,
                                maxCol=53, maxRow=400000,
                                maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self):
    """Parse covtype.data from an import folder and run random exec expressions.

    The import folder is set up once, before the loop. Each dataset tuple
    is (file name, destination key, exec timeout in seconds).
    """
    # An HDFS-backed folder ("/home/hduser/hdfs_datasets") was used here before.
    importFolderPath = "/home/0xdiag/datasets"
    h2o_import.setupImportFolder(None, importFolderPath)

    # The timeout is per dataset: it can be 10 secs for covtype 20x
    # (col key creation), so probably 10x that for covtype200.
    csvFilenameAll = [
        ("covtype.data", "cA", 5),
    ]
    csvFilenameList = csvFilenameAll
    lenNodes = len(h2o.nodes)

    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        # Creates csvFilename.hex from the file in the import folder.
        parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath,
                                                    key2=key2, timeoutSecs=2000)
        # Single-argument print keeps the output format of the Python 2
        # print statement.
        print("%s %s %s" % (csvFilename, 'parse time:', parseKey['response']['time']))
        # The label previously misspelled the key as 'desination_key'.
        print("%s %s" % ("Parse result['destination_key']:", parseKey['destination_key']))

        # Called to check the parse result can be inspected; the return
        # value is not used.
        h2o_cmd.runInspect(None, parseKey['destination_key'])

        print("\n" + csvFilename)
        h2e.exec_zero_list(zeroList)
        # The expressions use colX+1, so keep maxCol at 53.
        h2e.exec_expr_list_rand(lenNodes, exprList, key2,
                                maxCol=53, maxRow=400000,
                                maxTrials=200, timeoutSecs=timeoutSecs)
def test_exec_2(self):
    """Parse covtype datasets into 'cA.hex' and run random exec expressions.

    The dataset list and the trial count depend on h2o.localhost. Each
    dataset tuple is (file name, destination hex key, exec timeout in
    seconds); every dataset is parsed into the same key.
    """
    # exec2 doesn't autoframe? fvec everything
    # The timeout is per dataset: it can be 10 secs for covtype 20x
    # (col key creation), so probably 10x that for covtype200.
    if h2o.localhost:
        maxTrials = 200
        csvFilenameAll = [
            ("covtype.data", "cA.hex", 15),
        ]
    else:
        maxTrials = 20
        csvFilenameAll = [
            ("covtype.data", "cA.hex", 15),
            ("covtype20x.data", "cA.hex", 60),
        ]

    csvFilenameList = csvFilenameAll
    lenNodes = len(h2o.nodes)
    importFolderPath = "standard"

    for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
        # Not read below. NOTE(review): the draw still advances the random
        # module's state, which exec_expr_list_rand presumably relies on --
        # confirm before removing it.
        SEEDPERFILE = random.randint(0, sys.maxint)

        # Creates the hex key from the file under the import folder path.
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                                       hex_key=hex_key, timeoutSecs=2000)
        # The label previously misspelled the key as 'desination_key'.
        # Single-argument print keeps the output format of the Python 2
        # print statement.
        print("%s %s" % ("Parse result['destination_key']:", parseResult['destination_key']))

        # Called to check the parse result can be inspected; the return
        # value is not used.
        h2o_cmd.runInspect(None, parseResult['destination_key'])

        print("\n" + csvFilename)
        h2e.exec_zero_list(zeroList)
        # The expressions use colX+1, so keep maxCol at 53.
        h2e.exec_expr_list_rand(lenNodes, exprList, hex_key,
                                maxCol=53, maxRow=400000,
                                maxTrials=maxTrials, timeoutSecs=timeoutSecs)