def test_GLM2_tnc3_10(self):
    h2o.beta_features = True
    csvFilename = 'tnc3_10.csv'
    print "\n" + csvFilename
    hex_key = "tnc3.hex"
    h2b.browseTheCloud()

    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put',
        hex_key=hex_key, timeoutSecs=10)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if (1==0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'response': 13, 'n_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseResult
        # in any case, the destination_key in parseResult was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if (1==0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'response': 13, 'n_folds': 6}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_tnc3_ignore(self):
    csvFilename = 'tnc3_10.csv'
    csvPathname = h2o.find_file('smalldata/' + csvFilename)
    print "\n" + csvPathname
    key2 = "tnc3.hex"
    h2b.browseTheCloud()

    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
    print "Parse result['Key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if (1==0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'y': 13, 'num_cross_validation_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseKey
        # in any case, the destination_key in parseKey was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if (1==0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'y': 13, 'num_cross_validation_folds': 6}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_tnc3_ignore(self):
    csvFilename = "tnc3.csv"
    csvPathname = h2o.find_file("smalldata/" + csvFilename)
    print "\n" + csvPathname
    key2 = "tnc3.hex"
    h2b.browseTheCloud()

    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1)
    print "Parse result['Key']:", parseKey["destination_key"]
    inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if 1 == 0:
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(
            lenNodes, numExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10
        )
        print "\ncolResultList after num swap", colResultList

    if 1 == 1:
        print "\nWe're not getting CM data back from RFView.json that we can check! So look at the browser."
        print 'The good case with ignore="boat,body"'
        rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname)

    inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    # ******************
    if 1 == 0:
        colResultList = h2e.exec_expr_list_across_cols(
            lenNodes, charExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10
        )
        print "\ncolResultList after char swap", colResultList

    if 1 == 1:
        print "\nNow the bad case (no ignore)"
        rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname)

    inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_tnc3_ignore(self):
    csvPathname = 'tnc3.csv'
    print "\n" + csvPathname
    hex_key = "tnc3.hex"
    ### h2b.browseTheCloud()

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        schema='put', timeoutSecs=10, header=1)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if 1==1:
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        print "\nWe're not getting CM data back from RFView.json that we can check! So look at the browser."
        print 'The good case with ignore="boat,body"'
        rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, ignore="boat,body")

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if 1==0:
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if 1==1:
        print "\nNow the bad case (no ignore)"
        rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10)

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_tnc3_ignore(self):
    csvFilename = 'tnc3.csv'
    csvPathname = h2o.find_file('smalldata/' + csvFilename)
    print "\n" + csvPathname
    key2 = "tnc3.hex"
    h2b.browseTheCloud()

    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1)
    print "Parse result['Key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if 1==1:
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        print "\nWe're not getting CM data back from RFView.json that we can check! So look at the browser."
        print 'The good case with ignore="boat,body"'
        rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname)

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if 1==0:
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if 1==1:
        print "\nNow the bad case (no ignore)"
        rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname)

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_GLM2_tnc3_10(self):
    h2o.beta_features = True
    csvFilename = "tnc3_10.csv"
    print "\n" + csvFilename
    hex_key = "tnc3.hex"

    parseResult = h2i.import_parse(
        bucket="smalldata", path=csvFilename, schema="put", hex_key=hex_key, timeoutSecs=10
    )
    print "Parse result['Key']:", parseResult["destination_key"]
    inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
    ### time.sleep(10)

    if 1 == 0:
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(
            lenNodes, numExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10
        )
        print "\ncolResultList after num swap", colResultList

    if 1 == 1:
        start = time.time()
        kwargs = {"response": 13, "n_folds": 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseResult
        # in any case, the destination_key in parseResult was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, "took", time.time() - start, "seconds"

    inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])

    # ******************
    if 1 == 0:
        colResultList = h2e.exec_expr_list_across_cols(
            lenNodes, charExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10
        )
        print "\ncolResultList after char swap", colResultList

    if 1 == 1:
        start = time.time()
        kwargs = {"response": 13, "n_folds": 6}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, "took", time.time() - start, "seconds"

    inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
def test_GLM_many_cols_int2cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10000, 10, 'cA.hex', 100),
        (10000, 20, 'cB.hex', 200),
        (10000, 30, 'cC.hex', 300),
        (10000, 40, 'cD.hex', 400),
        (10000, 50, 'cE.hex', 500),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    exprList = [
        '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
        ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
    ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount,
            timeoutSecs=90, incrementingResult=False)
        print "\nexec colResultList", colResultList

        paramDict2 = {}
        for k in paramDict:
            paramDict2[k] = paramDict[k][0]

        # since we add the output twice, it's no longer colCount-1
        y = colCount
        kwargs = {'y': y, 'max_iter': 50, 'case': 1}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        # only col y-1 (next to last) doesn't get renamed in coefficients
        # due to enum/categorical expansion
        print "y:", y
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(3)
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(3)
def test_sum_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"
    h2i.setupImportFolder(None, importFolderPath)

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    # ("covtype20x.data", "cD", 50, 20),
    # ("covtype200x.data", "cE", 50, 200),
    csvFilenameAll = [
        ("covtype.data", "cA", 5, 1),
        ("covtype.data", "cB", 5, 1),
        ("covtype.data", "cC", 5, 1),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, minCol=0,
            maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_exec2_int2cat_nested(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 10, 'cA', 100),
        (1000, 20, 'cB', 100),
        (1000, 30, 'cC', 100),
        (1000, 40, 'cD', 100),
        (1000, 10, 'cE', 100),
        (1000, 20, 'cF', 100),
        (1000, 30, 'cG', 100),
        (1000, 40, 'cH', 100),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    # NOTE: exprList is assigned three times below; only the last assignment is used.
    exprList = [
        '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
        '<keyX>[,<col1>] = factor(<keyX>[,1]);',
        '<keyX>[,1] = factor(<keyX>[,<col2>]);',
        '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
        '<keyX>[,<col1>] = factor(<keyX>[,1]);',
        '<keyX>[,1] = factor(<keyX>[,<col2>]);',
    ]

    exprList = [
        '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
        '<keyX>[,<col1>] = factor(<keyX>[,1]);',
        '<keyX>[,1] = factor(<keyX>[,<col2>]);',
        '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
        '<keyX>[,<col1>] = factor(<keyX>[,1]);',
        '<keyX>[,1] = factor(<keyX>[,<col2>]);',
    ]

    exprList = [
        '<keyX>[,2] = factor(<keyX>[,2])',
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the exec commands across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount,
            timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList

    if not h2o.browse_disable:
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(3)
def test_many_cols_int2cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 10, 'cA', 100),
        (1000, 20, 'cB', 100),
        (1000, 30, 'cC', 100),
        (1000, 40, 'cD', 100),
        (1000, 10, 'cE', 100),
        (1000, 20, 'cF', 100),
        (1000, 30, 'cG', 100),
        (1000, 40, 'cH', 100),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    if 1 == 0:
        exprList = [
            '<keyX> = colSwap(<keyX>,<col1>,' +
                'colSwap(<keyX>,<col2>,' +
                'colSwap(<keyX>,<col1>,' +
                'colSwap(<keyX>,<col2>,' +
                '<keyX>[0]' +
                '))))',
        ]
    else:
        exprList = [
            '<keyX> = colSwap(<keyX>,<col1>,' +
                '<keyX>[0]' +
                ')',
        ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount,
            timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList

    if not h2o.browse_disable:
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(3)
def test_sum_import_hosts(self):
    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    # ("covtype20x.data", "cD", 50, 20),
    # ("covtype200x.data", "cE", 50, 200),
    csvFilenameAll = [
        ("covtype.data", "cA", 5, 1),
        ("covtype.data", "cB", 5, 1),
        ("covtype.data", "cC", 5, 1),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    ## h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    importFolderPath = "standard"

    firstDone = False
    for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            hex_key=hex_key, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, hex_key, minCol=0,
            maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_rf_tnc3_fvec(self):
    h2o.beta_features = True
    csvPathname = 'tnc3.csv'
    print "\n" + csvPathname
    hex_key = "tnc3.hex"
    ### h2b.browseTheCloud()

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        schema='put', timeoutSecs=10, retryDelaySecs=0.25, header=1)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

    if 1==1:
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1==1):
        print "\nWe're not getting CM data back from RFView.json that we can check! So look at the browser."
        print 'The good case with ignore="boat,body"'
        rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25,
            ignored_cols_by_name="boat,body")

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)

    #******************
    if 1==0:
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10, retryDelaySecs=0.25)
        print "\ncolResultList after char swap", colResultList

    if 1==1:
        print "\nNow the bad case (no ignore)"
        rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25)

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)
def test_exec2_int2cat_nested(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 10, 'cA', 100),
        (1000, 20, 'cB', 100),
        (1000, 30, 'cC', 100),
        (1000, 40, 'cD', 100),
        (1000, 10, 'cE', 100),
        (1000, 20, 'cF', 100),
        (1000, 30, 'cG', 100),
        (1000, 40, 'cH', 100),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    exprList = [
        '<keyX>[,<col2>] = <keyX>[,<col1>];',
        '<keyX>[,<col1>] = <keyX>[,1];',
        '<keyX>[,1] = <keyX>[,<col2>];',
        '<keyX>[,<col2>] = <keyX>[,<col1>];',
        '<keyX>[,<col1>] = <keyX>[,1];',
        '<keyX>[,1] = <keyX>[,<col2>];',
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the exec commands across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount,
            timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList

    if not h2o.browse_disable:
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(3)
def test_sum_import_hosts(self):
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = "/home/0xdiag/datasets"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    if localhost:
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype.data", "cB", 5, 1),
            ("covtype.data", "cC", 5, 1),
        ]
    else:
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype20x.data", "cD", 50, 20),
            ("covtype200x.data", "cE", 50, 200),
        ]
    ### csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        h2e.exec_zero_list(zeroList)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54,
            timeoutSecs=timeoutSecs)
        print "\n*************"
        print "colResultList", colResultList
        print "*************"

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x)/resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
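# The resultMult check in the tests above rests on a simple invariant: replicating a dataset
# N times row-wise scales every column sum by exactly N, so dividing the replicated sums by
# resultMult should reproduce the 1x sums. A minimal self-contained sketch of that arithmetic
# with made-up data (no h2o involved; _demo_result_mult_invariant is an illustrative name,
# not part of the test suite):
def _demo_result_mult_invariant():
    rows1x = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    resultMult = 20
    rowsNx = rows1x * resultMult                 # replicate the rows 20x
    sums1x = [sum(col) for col in zip(*rows1x)]  # per-column sums, 1x
    sumsNx = [sum(col) for col in zip(*rowsNx)]  # per-column sums, 20x
    compare = [x / resultMult for x in sumsNx]   # expected answer...i.e. Nx sums / N
    assert sums1x == compare, (sums1x, compare)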
def test_exec2_sum(self):
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data',
        returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname

        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54,
            timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_many_cols_int2cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 10, 'cA', 100),
        (1000, 20, 'cB', 100),
        (1000, 30, 'cC', 100),
        (1000, 40, 'cD', 100),
        (1000, 10, 'cE', 100),
        (1000, 20, 'cF', 100),
        (1000, 30, 'cG', 100),
        (1000, 40, 'cH', 100),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    if 1==0:
        exprList = [
            '<keyX> = colSwap(<keyX>,<col1>,' +
                'colSwap(<keyX>,<col2>,' +
                'colSwap(<keyX>,<col1>,' +
                'colSwap(<keyX>,<col2>,' +
                '<keyX>[0]' +
                '))))',
        ]
    else:
        exprList = [
            '<keyX> = colSwap(<keyX>,<col1>,' +
                '<keyX>[0]' +
                ')',
        ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount,
            timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList

    if not h2o.browse_disable:
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(3)
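# The colSwap/factor execs above turn an integer column into an enum in place. Conceptually,
# int-to-categorical conversion just maps each distinct integer to a level; a tiny pure-Python
# sketch of that idea (illustrative only -- this is not how h2o's factor() is implemented):
def _demo_int2cat(column):
    levels = sorted(set(column))                              # distinct ints become the levels
    level_index = dict((v, i) for i, v in enumerate(levels))  # level value -> level index
    return [level_index[v] for v in column], levels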
def test_sum(self):
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' + filename1x)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, key2, timeoutSecs, resultMult) in csvAll:
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=2000)
        print "Parse result['Key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname

        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54,
            timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_int2cat_factor_factor(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10000, 10, 'cA.hex', 100),
        (10000, 20, 'cB.hex', 200),
        (10000, 30, 'cC.hex', 300),
        (10000, 40, 'cD.hex', 400),
        (10000, 50, 'cE.hex', 500),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    exprList = [
        '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[0]))',
        ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[0])',
        ### '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
        ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=90)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount,
            timeoutSecs=90, incrementingResult=False)
        print "\nexec colResultList", colResultList

    if not h2o.browse_disable:
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(3)
def test_exec2_sum(self):
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('datasets', 'UCI/UCI-large/covtype/covtype.data',
        returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname

        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54,
            timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x)/resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
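# h2o_util.file_cat above builds the 2x dataset by concatenating the 1x file with itself.
# Assuming plain byte concatenation (the real helper may differ in details), a minimal
# equivalent sketch:
import shutil

def _demo_file_cat(srcA, srcB, dst):
    # concatenate srcA then srcB into dst, streaming so large files aren't loaded into memory
    with open(dst, 'wb') as out:
        for src in (srcA, srcB):
            with open(src, 'rb') as f:
                shutil.copyfileobj(f, out)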
def test_many_fp_formats_libsvm(self):
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        # for sel in range(48): # len(caseList)
        for sel in [random.randint(0, 47)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'],
                timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax+1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, num_cols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use num_cols? num_cols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                maxCol=colNumberMax+1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, num_rows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax: # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                syn = {}
                if k == 0:
                    syn['name'] = "Target"
                    syn['size'] = {1, 2} # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                    syn['type'] = {'int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    syn['scale'] = {1}
                    # syn['base'] = 0
                    # syn['variance'] = 0
                elif k == 1: # we forced this to always be 0
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1}
                    syn['type'] = {'int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    syn['scale'] = {1}
                    syn['base'] = 0
                    syn['variance'] = 0
                else:
                    syn['name'] = "V" + str(k)
                    syn['size'] = {1, 2, 4, 8} # can be 2, 4 or 8? maybe make this a set for membership check
                    syn['type'] = {'int', 'float'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    syn['scale'] = {1, 10, 100, 1000}
                    # syn['base'] = 0
                    # syn['variance'] = 0

                syn['num_missing_values'] = 0
                syn['enum_domain_size'] = 0
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            # for debug of why it was a bad size
                            print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                            print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
def test_exec2_int2cat_nested(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100000, 10, 'cA', 100),
        (100000, 20, 'cB', 100),
        (100000, 30, 'cC', 100),
    ]

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    if DO_CASE == 0:
        exprList = [
            '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
            '<keyX>[,<col1>] = factor(<keyX>[,1]);',
            '<keyX>[,1] = factor(<keyX>[,<col2>]);',
            '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
            '<keyX>[,<col1>] = factor(<keyX>[,1]);',
            '<keyX>[,1] = factor(<keyX>[,<col2>]);',
        ]
    elif DO_CASE == 1:
        exprList = [
            '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
            '<keyX>[,<col1>] = factor(<keyX>[,1]);',
            '<keyX>[,1] = factor(<keyX>[,<col2>]);',
            '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
            '<keyX>[,<col1>] = factor(<keyX>[,1]);',
            '<keyX>[,1] = factor(<keyX>[,<col2>]);',
        ]
    elif DO_CASE == 2:
        exprList = [
            '<keyX>[,2] = factor(<keyX>[,2])',
        ]
    else:
        raise Exception("Bad case: %s" % DO_CASE)

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the exec commands across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount,
            timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList
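# The exprList entries use <keyX>, <col1>, <col2> placeholders that
# h2e.exec_expr_list_across_cols presumably fills in per column before sending each exec
# request. A sketch of just that substitution step, under that assumption (the real helper
# also handles node selection, retries, and result collection):
def _demo_fill_expr(expr, key, col):
    # e.g. '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);' with key='cA', col=2
    # becomes 'cA[,2] = factor(cA[,2]);'
    return (expr.replace('<keyX>', key)
                .replace('<col1>', str(col))
                .replace('<col2>', str(col + 1)))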
def test_GLM_tnc3_10(self):
    csvFilename = 'tnc3_10.csv'
    print "\n" + csvFilename
    hex_key = "tnc3.hex"
    h2b.browseTheCloud()

    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put',
        hex_key=hex_key, timeoutSecs=10)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    if (1 == 0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1 == 1):
        start = time.time()
        kwargs = {'y': 13, 'n_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseResult
        # in any case, the destination_key in parseResult was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    if (1 == 0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1 == 1):
        start = time.time()
        kwargs = {'y': 13, 'n_folds': 6}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(3600)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_GLM_many_cols_int2cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10000, 10, 'cA.hex', 100),
        (10000, 20, 'cB.hex', 200),
        (10000, 30, 'cC.hex', 300),
        (10000, 40, 'cD.hex', 400),
        (10000, 50, 'cE.hex', 500),
    ]

    ### h2b.browseTheCloud()

    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    exprList = [
        '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
        ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
    ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount,
            timeoutSecs=90, incrementingResult=False)
        print "\nexec colResultList", colResultList

        paramDict2 = {}
        for k in paramDict:
            paramDict2[k] = paramDict[k][0]

        # since we add the output twice, it's no longer colCount-1
        y = colCount
        kwargs = {'y': y, 'max_iter': 50, 'case': 1}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        # only col y-1 (next to last) doesn't get renamed in coefficients
        # due to enum/categorical expansion
        print "y:", y
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(3)
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(3)
def test_many_fp_formats_libsvm_fvec(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, "cA", 30, "sparse50"),
        (100, 10, "cB", 30, "sparse"),
        (100000, 100, "cC", 30, "sparse"),
        (1000, 10, "cD", 30, "sparse50"),
        (100, 100, "cE", 30, "sparse"),
        (100, 100, "cF", 30, "sparse50"),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution
            )

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=selKey2, timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            numCols = inspect["numCols"]
            numRows = inspect["numRows"]
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult["destination_key"], timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(
                colNumberMax + 1,
                numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols),
            )

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            colSumList = h2e.exec_expr_list_across_cols(
                None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs
            )

            self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax:  # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(
                    k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols),
                )

                syn = {}
                if k == 0:
                    syn["name"] = "C1"
                    syn["type"] = {"Int"}
                    syn["min"] = classMin
                    syn["max"] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1:  # we forced this to always be 0
                    syn["name"] = "C2"
                    syn["type"] = {"Int"}
                    syn["min"] = 0
                    syn["max"] = 0
                    # syn['scale'] = {1}
                else:
                    syn["name"] = "C" + str(k + 1)
                    syn["type"] = {"Int", "Real"}
                    syn["min"] = valMin
                    syn["max"] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn["naCnt"] = 0
                syn["cardinality"] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect["cols"][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == "min":
                        self.assertTrue(
                            syn[synKey] <= cols[synKey],
                            msg="col %s %s %s should be <= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "max":
                        self.assertTrue(
                            syn[synKey] >= cols[synKey],
                            msg="col %s %s %s should be >= %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )
                    elif synKey == "type":
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols["min"], cols["max"]
                            print "syn min/max:", syn["min"], syn["max"]
                            raise Exception(
                                "col %s %s %s should be in this allowed %s" % (k, synKey, cols[synKey], syn[synKey])
                            )
                    else:
                        self.assertEqual(
                            syn[synKey],
                            cols[synKey],
                            msg="col %s %s %s should be %s" % (k, synKey, cols[synKey], syn[synKey]),
                        )

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(
                    float(v), colSum, places=0, msg="%0.6f col sum is not equal to expected %0.6f" % (v, colSum)
                )
def test_many_fp_formats_libsvm(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 10, 'cA', 30, 'sparse50'),
        (100, 10, 'cB', 30, 'sparse'),
        (100000, 100, 'cC', 30, 'sparse'),
        (1000, 10, 'cD', 30, 'sparse50'),
        (100, 100, 'cE', 30, 'sparse'),
        (100, 100, 'cF', 30, 'sparse50'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0, NUM_CASES - 1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (synColSumDict, colNumberMax) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'],
                timeoutSecs=300)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # we might have added some zeros at the end, that our colNumberMax won't include
            print synColSumDict.keys(), colNumberMax
            self.assertEqual(colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            h2e.exec_zero_list(zeroList)
            # how do we know the max dimension (synthetic may not generate anything for the last col)
            # use numCols? numCols should be <= colCount.
            colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            print "\ncolSumList:", colSumList
            print "\nsynColSumDict:", synColSumDict

            for k, v in synColSumDict.iteritems():
                if k > colNumberMax: # ignore any extra 0 cols at the end
                    continue

                # k should be integers that match the number of cols
                self.assertTrue(k >= 0 and k < len(colSumList),
                    msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols))

                syn = {}
                if k == 0:
                    syn['name'] = "C1"
                    syn['type'] = {'Int'}
                    syn['min'] = classMin
                    syn['max'] = classMax
                    # don't check these for the col 0 'Target'
                    # syn['scale'] = {1}
                elif k == 1: # we forced this to always be 0
                    syn['name'] = "C2"
                    syn['type'] = {'Int'}
                    syn['min'] = 0
                    syn['max'] = 0
                    # syn['scale'] = {1}
                else:
                    syn['name'] = "C" + str(k + 1)
                    syn['type'] = {'Int', 'Real'}
                    syn['min'] = valMin
                    syn['max'] = valMax
                    # syn['scale'] = {1,10,100,1000}

                syn['naCnt'] = 0
                syn['cardinality'] = -1
                # syn['min'] = 0
                # syn['max'] = 0
                # syn['mean'] = 0

                cols = inspect['cols'][k]
                for synKey in syn:
                    # we may not see the min/max range of values that was bounded by our gen, but
                    # we can check that it's a subset of the allowed range
                    if synKey == 'min':
                        self.assertTrue(syn[synKey] <= cols[synKey],
                            msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'max':
                        self.assertTrue(syn[synKey] >= cols[synKey],
                            msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                    elif synKey == 'type':
                        if cols[synKey] not in syn[synKey]:
                            print "cols min/max:", cols['min'], cols['max']
                            print "syn min/max:", syn['min'], syn['max']
                            raise Exception('col %s %s %s should be in this allowed %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                    else:
                        self.assertEqual(syn[synKey], cols[synKey],
                            msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

                colSum = colSumList[k]
                print "\nComparing col", k, "sums:", v, colSum
                # Even though we're comparing floating point sums, the operations probably should have
                # been done in same order, so maybe the comparison can be exact (or not!)
                self.assertAlmostEqual(float(v), colSum, places=0,
                    msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
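# The synColSumDict comparison above needs per-column sums of the generated libsvm file.
# For reference, a minimal reader that accumulates them from the standard
# "label idx:val idx:val ..." libsvm line format (assuming the generator writes that format;
# column 0 is treated as the label/target, matching the C1/'Target' convention above; the
# helper name is illustrative, not part of the test suite):
def _demo_libsvm_col_sums(path):
    sums = {0: 0.0}
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            sums[0] += float(tokens[0])  # label column
            for pair in tokens[1:]:
                idx, val = pair.split(':')
                sums[int(idx)] = sums.get(int(idx), 0.0) + float(val)
    return sums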
def test_many_fp_formats_libsvm_2_fvec(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname,
                rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o to parse it as libsvm..it doesn't detect that automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid the output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" %
                (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs, print_params=False)
                # print "\n*************"
                # print "colResultList", colResultList
                # print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # compare to expected: we kept the list of fp sums per col above
            # when we generated the dataset
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be an integer that indexes into the parsed cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in the same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='col sum %0.6f is not equal to expected %0.6f' % (compare, v))

                synMean = (v + 0.0) / rowCount
                # enums don't have a mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k:", k, "mean:", mean, "remembered sum/rowCount:", synMean
                    sys.stdout.flush()
                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' %
                        (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
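# The fvec variant above gates its hard failure on h2o_util.approxEqual(mean, synMean, tol=1e-3)
# rather than assertAlmostEqual, so a tiny fp-format rounding difference triggers the verbose
# diagnostic path instead of a bare assertion. The actual helper is defined in h2o_util; the
# sketch below is only a guess at its semantics (a symmetric relative-tolerance compare),
# shown to make the tolerance choice concrete.
def approxEqual_sketch(a, b, tol=1e-6):
    if a == b:  # also covers exact zeros without dividing by zero
        return True
    # scale by the larger magnitude so the check is symmetric in a and b
    return abs(a - b) <= tol * max(abs(a), abs(b))

# e.g. means differing only past the third significant digit pass with tol=1e-3:
# approxEqual_sketch(1.2345, 1.2349, tol=1e-3) -> True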
def test_many_fp_formats_libsvm_2(self):
    h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        (100, 40000, 'cC', 300, 'sparse50'),
        (100, 40000, 'cD', 300, 'sparse'),
    ]

    for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList:
        # for sel in range(48):  # len(caseList)
        for sel in [random.randint(0, 47)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname,
                rowCount, colCount, SEEDPERFILE, sel, distribution)

            selKey2 = key2 + "_" + str(sel)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid the output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseKey['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax + 1, num_cols,
                msg="generated %s cols (including output). parsed to %s cols" %
                (colNumberMax + 1, num_cols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs)
                print "\n*************"
                print "colResultList", colResultList
                print "*************"

            self.assertEqual(rowCount, num_rows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

            # compare to expected: we kept the list of fp sums per col above
            # when we generated the dataset
            ### print "\nsynColSumDict:", synColSumDict
            for k, v in synColSumDict.iteritems():
                if DO_COMPARE_SUM:
                    # k should be an integer that indexes into the parsed cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in the same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='col sum %0.6f is not equal to expected %0.6f' % (compare, v))

                synMean = (v + 0.0) / rowCount
                # enums don't have a mean, but we're not enums
                mean = inspect['cols'][k]['mean']
                # our fp formats in the syn generation sometimes only have two places?
                self.assertAlmostEqual(mean, synMean, places=0,
                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                num_missing_values = inspect['cols'][k]['num_missing_values']
                self.assertEqual(0, num_missing_values,
                    msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
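# Note the older (pre-fvec) Inspect schema used here: 'num_cols', 'num_rows', and per-col
# 'num_missing_values', versus 'numCols', 'numRows', and 'naCnt' in the fvec test above.
# If one assertion body ever had to serve both APIs, a tolerant accessor like this
# hypothetical helper would do it; the name and approach are not from the test files.
def inspect_field(inspect, *candidateKeys):
    for key in candidateKeys:
        if key in inspect:
            return inspect[key]
    raise KeyError("none of %s in inspect result" % (candidateKeys,))

# usage sketch: numCols = inspect_field(inspect, 'numCols', 'num_cols')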
def test_GLM2_tnc3_10(self):
    h2o.beta_features = True
    csvFilename = 'tnc3_10.csv'
    print "\n" + csvFilename
    hex_key = "tnc3.hex"

    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put',
        hex_key=hex_key, timeoutSecs=10)
    print "Parse result['Key']:", parseResult['destination_key']
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    ### time.sleep(10)

    if (1 == 0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key,
            maxCol=10, incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    if (1 == 1):
        start = time.time()
        kwargs = {'response': 13, 'n_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseResult
        # in any case, the destination_key in parseResult was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on", csvFilename, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

    #******************
    if (1 == 0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key,
            maxCol=10, incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1 == 1):
        start = time.time()
        kwargs = {'response': 13, 'n_folds': 6}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on", csvFilename, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
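# The (1==0)/(1==1) literals above are hand-flipped toggles for the exec-swap and GLM
# branches. One thing to watch if the char-swap branch is ever re-enabled: lenNodes is
# assigned only inside the first disabled branch, so it would need to move out of it.
# Named module-level flags read and grep better; DO_NUM_SWAP and DO_CHAR_SWAP below are
# hypothetical names, not from the original file, just a sketch of the pattern.
DO_NUM_SWAP = False
DO_CHAR_SWAP = False

# then, for example:
# if DO_NUM_SWAP:
#     lenNodes = len(h2o.nodes)
#     colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key,
#         maxCol=10, incrementingResult=False, timeoutSecs=10)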