def test_factor_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() # use SEED so the file isn't cached? csvFilenameAll = [ ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 1mx8 csv" write_syn_dataset(csvPathname, 1000000, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def exec_list(exprList, lenNodes, csvFilename, key2): h2e.exec_zero_list(zeroList) # start with trial = 1 because trial-1 is used to point to Result0 which must be initted trial = 1 while (trial < 100): for exprTemplate in exprList: # do each expression at a random node, to facilate key movement nodeX = random.randint(0,lenNodes-1) colX = random.randint(1,54) # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now row = str(random.randint(1,400000)) execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2) execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey="Result"+str(trial)+".hex", timeoutSecs=60) eri0 = execResultInspect[0] eri1 = execResultInspect[1] columns = eri0.pop('cols') columnsDict = columns[0] print "\nexecResult columns[0]:", h2o.dump_json(columnsDict) print "\nexecResult [0]:", h2o.dump_json(eri0) print "\nexecResult [1] :", h2o.dump_json(eri1) min = columnsDict["min"] h2o.verboseprint("min: ", min, "trial:", trial) ### self.assertEqual(float(min), float(trial),"what can we check here") ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") # slows things down to check every iteration, but good for isolation h2o.check_sandbox_for_errors() print "Trial #", trial, "completed\n" trial += 1
def test_slice(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def exec_list(exprList, lenNodes, csvFilename, hex_key, colX): h2e.exec_zero_list(zeroList) # start with trial = 1 because trial-1 is used to point to Result0 which must be initted trial = 1 while (trial < 100): for exprTemplate in exprList: # do each expression at a random node, to facilate key movement nodeX = random.randint(0, lenNodes - 1) # billion rows only has two cols # colX is incremented in the fill_in_expr_template # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now row = str(random.randint(1, 400000)) execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, hex_key) execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey="Result" + str(trial) + ".hex", timeoutSecs=60) h2o.check_sandbox_for_errors() print "Trial #", trial, "completed\n" trial += 1
def test_slice(self): importFolderPath = "standard" csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['desination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, hex_key, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_dkv(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilenameAll = [ ("syn_10x8.csv", 'cA', 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 10x8 csv" write_syn_dataset(csvPathname, 10, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=6, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_exec2_col_scalar(self): h2o.beta_features = True # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [("covtype.data", "cA", 15)] else: maxTrials = 20 csvFilenameAll = [("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60)] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, timeoutSecs=2000 ) print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand( lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs )
def test_exec_import_hosts(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_exec2_col_scalar(self): # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 maxTrials = 20 csvFilenameAll = [ ("covtype.data", 15), ("covtype20x.data", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" # just always use the same hex_key, so the zeroList is right all the time hex_key = 'cA' for (csvFilename, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec2_dkv(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilenameAll = [ ("syn_10x8.csv", 'cA', 15), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 10x8 csv" write_syn_dataset(csvPathname, 10, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=6, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self): # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype.data", "cB", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: # import each time, because h2o deletes source file after parse # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['Key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_sum_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 # ("covtype20x.data", "cD", 50, 20), # ("covtype200x.data", "cE", 50, 200), csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype.data", "cB", 5, 1), ("covtype.data", "cC", 5, 1), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols( lenNodes, exprList, key2, minCol=0, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_sum_import_hosts(self): # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 # ("covtype20x.data", "cD", 50, 20), # ("covtype200x.data", "cE", 50, 200), csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype.data", "cB", 5, 1), ("covtype.data", "cC", 5, 1), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" firstDone = False for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse( bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols( lenNodes, exprList, hex_key, minCol=0, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_sum_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype.data", "cB", 5, 1), ("covtype.data", "cC", 5, 1), ] else: csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype20x.data", "cD", 50, 20), ("covtype200x.data", "cE", 50, 200), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs) print "\n*************" print "colResultList", colResultList print "*************" if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x)/resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_exec2_constants(self): print "Create some vectors from a constant" print "Don't really need a dataset, but .." for i in range(10): h2e.exec_zero_list(zeroList) inspect = h2o_cmd.runInspect(key="Result9") h2o_cmd.infoFromInspect(inspect, "Result9") numRows = inspect["numRows"] numCols = inspect["numCols"] self.assertEqual(numRows, 1000000) self.assertEqual(numCols, 1)
def test_exec2_constants(self): print "Create some vectors from a constant" print "Don't really need a dataset, but .." for i in range(10): h2e.exec_zero_list(zeroList) inspect = h2o_cmd.runInspect(key='Result9') h2o_cmd.infoFromInspect(inspect, 'Result9') numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numRows, 1000000) self.assertEqual(numCols, 1)
def test_loop_random_exec_covtype(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=15) print "\nParse key is:", parseResult['destination_key'] # h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_exec2_sum(self): h2o.beta_features = True print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll: parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print "Parse result['Key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname h2o_exec.exec_zero_list(zeroList) colResultList = h2o_exec.exec_expr_list_across_cols( lenNodes, exprList, hex_key, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_exec2_constants(self): print "Create some vectors from a constant" print "Don't really need a dataset, but .." h2o.beta_features = True for i in range(10): h2e.exec_zero_list(zeroList) inspect = h2o_cmd.runInspect(key='Result9') h2o_cmd.infoFromInspect(inspect, 'Result9') numRows = inspect['numRows'] numCols = inspect['numCols'] self.assertEqual(numRows, 1000000) self.assertEqual(numCols, 1)
def test_loop_random_exec_covtype(self): csvPathname = 'UCI/UCI-large/covtype/covtype.data' parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=15) print "\nParse key is:", parseResult['destination_key'] # h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_loop_random_exec_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15) print "\nParse key is:", parseKey['destination_key'] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=5) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (csvFilename, key2, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_vector_filter_factor(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype20x.data", "cC", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # have to import each time, because h2o deletes the source file after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec2_covtype_rand1(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=15) print "\nParse key is:", parseResult['destination_key'] ### h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=10) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_loop_random_exec_covtype(self): csvPathname = "UCI/UCI-large/covtype/covtype.data" parseResult = h2i.import_parse( bucket="datasets", path=csvPathname, schema="put", hex_key="c.hex", timeoutSecs=15 ) print "\nParse key is:", parseResult["destination_key"] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand( len(h2o.nodes), exprList, "c.hex", maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=10 ) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data", "took", time.time() - start, "seconds"
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_sum(self): print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' + filename1x) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, key2, timeoutSecs, resultMult) in csvAll: parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=2000) print "Parse result['Key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname h2o_exec.exec_zero_list(zeroList) colResultList = h2o_exec.exec_expr_list_across_cols( lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_exec_2(self): # exec2 doesn't autoframe? fvec everything h2o.beta_features = True # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA.hex", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA.hex", 15), ("covtype20x.data", "cA.hex", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['desination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_slice(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_loop_random_exec_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15) print "\nParse key is:", parseKey['destination_key'] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data", 'took', time.time( ) - start, 'seconds'
def exec_list(exprList, lenNodes, csvFilename, hex_key): h2e.exec_zero_list(zeroList) # start with trial = 1 because trial-1 is used to point to Result0 which must be initted trial = 1 while (trial < 100): for exprTemplate in exprList: # do each expression at a random node, to facilate key movement nodeX = random.randint(0,lenNodes-1) colX = random.randint(1,54) # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now row = str(random.randint(1,400000)) execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, hex_key) execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey="Result"+str(trial)+".hex", timeoutSecs=60) h2o.check_sandbox_for_errors() print "Trial #", trial, "completed\n" trial += 1
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (csvFilename, key2, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_vector_filter_factor(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype20x.data", "cC", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # have to import each time, because h2o deletes the source file after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_exec2_sum(self): h2o.beta_features = True print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2i.find_folder_and_filename('datasets', 'UCI/UCI-large/covtype/covtype.data', returnFullPath=True) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll: parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000) print "Parse result['Key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname h2o_exec.exec_zero_list(zeroList) colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x)/resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_exec_import_hosts(self): # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" cnum = 0 for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ("covtype20x.data", "cC", 50), ("covtype20x.data", "cD", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self): # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2o_import.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def exec_list(exprList, lenNodes, csvFilename, key2): h2e.exec_zero_list(zeroList) # start with trial = 1 because trial-1 is used to point to Result0 which must be initted trial = 1 while (trial < 100): for exprTemplate in exprList: # do each expression at a random node, to facilate key movement nodeX = random.randint(0, lenNodes - 1) colX = random.randint(1, 54) # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now row = str(random.randint(1, 400000)) execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2) execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey="Result" + str(trial) + ".hex", timeoutSecs=60) eri0 = execResultInspect[0] eri1 = execResultInspect[1] columns = eri0.pop('cols') columnsDict = columns[0] print "\nexecResult columns[0]:", h2o.dump_json(columnsDict) print "\nexecResult [0]:", h2o.dump_json(eri0) print "\nexecResult [1] :", h2o.dump_json(eri1) min = columnsDict["min"] h2o.verboseprint("min: ", min, "trial:", trial) ### self.assertEqual(float(min), float(trial),"what can we check here") ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") # slows things down to check every iteration, but good for isolation h2o.check_sandbox_for_errors() print "Trial #", trial, "completed\n" trial += 1
def test_exec_2(self): # exec2 doesn't autoframe? fvec everything # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if h2o.localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA.hex", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA.hex", 15), ("covtype20x.data", "cA.hex", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ## h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = "standard" for (csvFilename, hex_key, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['desination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_many_fp_formats_libsvm_fvec(self):
    """Generate random synthetic CSVs in one of several fp formats, parse them,
    then verify column metadata (name/type/min/max/naCnt/cardinality) and
    per-column sums via exec against the sums remembered at generation time."""
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key prefix, timeoutSecs, distribution)
    tryList = [
        (10, 10, "cA", 30, "sparse50"),
        (100, 10, "cB", 30, "sparse"),
        (100000, 100, "cC", 30, "sparse"),
        (1000, 10, "cD", 30, "sparse50"),
        (100, 100, "cE", 30, "sparse"),
        (100, 100, "cF", 30, "sparse50"),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one randomly chosen fp-format case per config
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution
            )
            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            numCols = inspect["numCols"]
            numRows = inspect["numRows"]
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult["destination_key"], timeoutSecs=300)
            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1,
                numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols),
            )

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs
            )

            self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols),
                )

                # build the expected column metadata to compare against inspect
                syn = {}
                if k == 0:
                    syn["name"] = "C1"
                    syn["type"] = {"Int"}
                    syn["min"] = classMin
                    syn["max"] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn["name"] = "C2"
                    syn["type"] = {"Int"}
                    syn["min"] = 0
                    syn["max"] = 0
                    # syn['scale'] = {1}
                else:
                    syn["name"] = "C" + str(k + 1)
                    syn["type"] = {"Int", "Real"}
                    syn["min"] = valMin
                    syn["max"] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn["naCnt"] = 0
                syn["cardinality"] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect["cols"][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == "min":
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg="col %s %s %s should be <= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "max":
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg="col %s %s %s should be >= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "type":
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols["min"], cols["max"]
                            print "syn min/max:", syn["min"], syn["max"]
                            raise Exception(
                                "col %s %s %s should be in this allowed %s" % (k, synKey, cols[synKey], syn[synKey])
                            )
                    else:
                        self.assertEqual(
                            syn[synKey],
                            cols[synKey],
                            msg="col %s %s %s should be %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0, msg="%0.6f col sum is not equal to expected %0.6f" % (v, colSum)
                )
def test_many_fp_formats_libsvm (self):
    """Generate random synthetic CSVs in one of 48 fp formats, parse them,
    then verify column metadata (name/size/type/min/max/scale/etc.) and
    per-column sums via exec against the sums remembered at generation time.
    (Pre-fvec variant: uses num_cols/num_rows and Target/V* column names.)"""
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key prefix, timeoutSecs, distribution)
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30,'sparse'),
        (100, 100, 'cF', 30,'sparse50'),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        # for sel in range(48): # len(caseList)
        for sel in [random.randint(0,47)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)
            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax+1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use num_cols?. num_cols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                maxCol=colNumberMax+1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, num_rows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if k > colNumberMax: # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k>=0 and k<len(colSumList),
                    msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                # build the expected column metadata to compare against inspect;
                # set-valued entries mean "any member is acceptable"
                syn = {}
                if k==0:
                    syn['name'] = "Target"
                    # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                    syn['size'] = {1,2}
                    syn['type'] = {'int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    syn['scale'] = {1}
                    # syn['base'] = 0
                    # syn['variance'] = 0
                elif k==1: # we forced this to always be 0
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1}
                    syn['type'] = {'int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    syn['scale'] = {1}
                    syn['base'] = 0
                    syn['variance'] = 0
                else:
                    syn['name'] = "V" + str(k)
                    # can be 2, 4 or 8? maybe make this a set for membership check
                    syn['size'] = {1,2,4,8}
                    syn['type'] = {'int', 'float'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    syn['scale'] = {1,10,100,1000}
                    # syn['base'] = 0
                    # syn['variance'] = 0

                syn['num_missing_values'] = 0
                syn['enum_domain_size'] = 0
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            # for debug of why it was a bad size
                            print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                            print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_many_fp_formats_libsvm(self):
    """Generate random synthetic CSVs in one of several fp formats, parse them,
    then verify column metadata (name/type/min/max/naCnt/cardinality) and
    per-column sums via exec against the sums remembered at generation time.
    (fvec-key variant: uses numCols/numRows and C1..Cn column names.)"""
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key prefix, timeoutSecs, distribution)
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one randomly chosen fp-format case per config
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300)
            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use numCols?. numCols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols))

                # build the expected column metadata to compare against inspect
                syn = {}
                if k == 0:
                    syn['name'] = "C1"
                    syn['type'] = {'Int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn['name'] = "C2"
                    syn['type'] = {'Int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    # syn['scale'] = {1}
                else:
                    syn['name'] = "C" + str(k + 1)
                    syn['type'] = {'Int', 'Real'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn['naCnt'] = 0
                syn['cardinality'] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols['min'], cols['max']
                            print "syn min/max:", syn['min'], syn['max']
                            raise Exception(
                                'col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(
                            syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_many_fp_formats_libsvm_2_fvec(self):
    """Generate wide random SVMLight datasets, parse with parser_type='SVMLight',
    then check per-column sums (optional) and verify each column's mean matches
    the generated mean and has no NAs; on a mean mismatch, re-run an exec sum
    for diagnostics before raising."""
    #h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key prefix, timeoutSecs, distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # one randomly chosen fp-format case per config
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)
            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(
                    key=selKey2, max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                colNumberMax + 1, numCols,
                msg=
                "generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(
                    None, exprList, selKey2, maxCol=colNumberMax + 1,
                    timeoutSecs=timeoutSecs, print_params=False)
                #print "\n*************"
                #print "colResultList", colResultList
                #print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            # sort by col index so the per-column report below is in order
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(
                        v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    # mismatch: gather diagnostics (exec sum, remembered sum) before failing
                    execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(
                        resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception(
                        'col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_many_fp_formats_libsvm_2 (self):
    """Generate wide random libsvm-style datasets, parse them, then check
    per-column sums (optional) and verify each column's mean matches the
    generated mean with zero missing values.
    (Pre-fvec variant: uses num_cols/num_rows and num_missing_values.)"""
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key prefix, timeoutSecs, distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        (100, 40000, 'cC', 300, 'sparse50'),
        (100, 40000, 'cD', 300, 'sparse'),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList:
        # for sel in range(48): # len(caseList)
        for sel in [random.randint(0,47)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = key2 + "_" + str(sel)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs, doSummary=False)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseKey['destination_key'],
                timeoutSecs=300, noPrint=True)
            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax+1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax+1, timeoutSecs=timeoutSecs)
                print "\n*************"
                print "colResultList", colResultList
                print "*************"

            self.assertEqual(rowCount, num_rows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            ### print "\nsynColSumDict:", synColSumDict

            for k,v in synColSumDict.iteritems():
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0)/rowCount
                # enums don't have mean, but we're not enums
                mean = inspect['cols'][k]['mean']
                # our fp formats in the syn generation sometimes only have two places?
                self.assertAlmostEqual(mean, synMean, places=0,
                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                num_missing_values = inspect['cols'][k]['num_missing_values']
                self.assertEqual(0, num_missing_values,
                    msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
def test_many_fp_formats_libsvm_2_fvec(self):
    """Fvec variant of the libsvm fp-format test: generate random libsvm
    datasets, parse via h2i.import_parse with parser_type='SVMLight', then
    check H2O's Inspect results against the generator's per-column sums.

    Differences from the non-fvec version (as visible here): uses
    h2o_util.fp_format() to size the case list, fvec-style Inspect keys
    (numCols/numRows/naCnt), and on a mean mismatch re-runs an exec sum on
    the failing column and dumps diagnostics before raising.

    Requires a running H2O cloud (uses h2o, h2o_cmd, h2e, h2i, h2o_glm,
    h2o_util helpers).
    """
    #h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        # number of fp-format cases available; pick one at random per config
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

            # distinct dest key per case so reruns don't collide
            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the firs tone
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # +1: colNumberMax is the highest col index; parse adds the output col
            self.assertEqual(colNumberMax+1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                # per-column sums computed by H2O exec, indexed by col number
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax+1, timeoutSecs=timeoutSecs, print_params=False)
                #print "\n*************"
                #print "colResultList", colResultList
                #print "*************"

            self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            # sort by col number so failures are reported in column order
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k,v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0)/rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    # mismatch: re-run an exec sum on the failing column and dump
                    # everything we know before raising, to help triage.
                    # k+1: exec column indexing is 1-based here — TODO confirm
                    execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ",k , "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt,
                    msg='col %s naCnt %d should be 0' % (k, naCnt))