def test_slice(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = ["covtype.data"] else: csvFilenameList = [ "covtype200x.data", "covtype200x.data", "covtype.data", "covtype.data", "covtype20x.data", "covtype20x.data", ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = "/home/0xdiag/datasets/standard" validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # have to re-import each iteration now, since the source key # is removed and if we re-parse it, it's not there h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey["destination_key"]) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm["GLMModel"] coefficients = GLMModel["coefficients"] validationsList = GLMModel["validations"] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, "err", validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write(".") sys.stdout.flush()
def test_import_multi_syn_datasets(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) print "This imports a folder of csv files..i.e points to syn_datasets with no regex" print "Doesn't put anything in syn_datasets. When run with import folder redirected" print "to import S3, there is a syn_datasets with 100 files" print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?" timeoutSecs = 500 if h2o.nodes[0].redirect_import_folder_to_s3_path: csvFilenameAll = [ # FIX! ..just folder doesn't appear to work. add regex # need a destination_key...h2o seems to use the regex if I don't provide one ### "syn_datasets/*", "syn_datasets/*_10000x200*", ] else: csvFilenameAll = [ # FIX! ..just folder doesn't appear to work. add regex # need a destination_key...h2o seems to use the regex if I don't provide one ### "syn_datasets/*", "syn_datasets/*", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2="syn_datasets.hex", timeoutSecs=500) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ "from all files num_rows:", "{:,}".format(inspect['num_rows']), \ "num_cols:", "{:,}".format(inspect['num_cols']) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "%d pct. 
of timeout" % ((elapsed / timeoutSecs) * 100) # so we can see! h2b.browseJsonHistoryAsUrlLastMatch("RFView") time.sleep(5)
def test_import_billion_rows_parse_loop(self): print "Apparently we can't handle 1B rows .gzed" csvFilename = "billion_rows.csv.gz" importFolderPath = "/home/0xdiag/datasets" trialMax = 3 for tryHeap in [4,16]: print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \ "then loop parsing 'billion_rows.csv' to unique keys" h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap) timeoutSecs=800 for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again h2i.setupImportFolder(None, importFolderPath) key2 = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again. ", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) print "Removing", parseKey['source_key'] removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key']) ### print "removeKeyResult:", h2o.dump_json(removeKeyResult) # sticky ports? h2o.tear_down_cloud() time.sleep(5)
def test_import_covtype_parse_loop(self): csvFilename = "covtype.data" importFolderPath = "/home/0xdiag/datasets" trialMax = 2 for tryHeap in [4, 3, 2, 1]: print "\n", tryHeap, "GB heap, 4 jvms, import folder, then loop parsing 'covtype.data' to unique keys" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(node_count=4, java_heap_GB=tryHeap) else: h2o_hosts.build_cloud_with_hosts(node_count=4, java_heap_GB=tryHeap) h2i.setupImportFolder(None, importFolderPath) for trial in range(trialMax): key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() time.sleep(5) print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def parseFile(self, s3bucket, localbucket, pathname, timeoutSecs, header, **kwargs):
    """Parse one dataset through the import folder or through s3n, and time it.

    When the module-level USE_LOCAL flag is set, "/" + localbucket + pathname
    is split into folder and file name and parsed through the import folder.
    Otherwise "s3n://" + s3bucket + pathname is imported on node 0 and
    "*" + pathname is parsed there. Both paths use the caller's timeoutSecs
    and header.

    Returns the parse result, with the wall-clock parse time added under
    'python_call_timer'. Extra **kwargs are accepted but not used.
    """
    if USE_LOCAL:
        # this can get redirected to s3/s3n by jenkins
        (importFolderPath, csvFilename) = os.path.split("/" + localbucket + pathname)
        h2i.setupImportFolder(None, importFolderPath)
        start = time.time()
        # honour the caller's timeout and header, as the s3n branch does
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=timeoutSecs, header=header)
    else:
        URI = "s3n://" + s3bucket + pathname
        h2o.nodes[0].import_hdfs(URI)
        start = time.time()
        parseKey = h2o.nodes[0].parse("*" + pathname, timeoutSecs=timeoutSecs, header=header)

    parse_time = time.time() - start
    h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
    parseKey['python_call_timer'] = parse_time
    return parseKey
def test_exec_import_hosts_bigfiles(self): # just do the import folder once importFolderPath = "/home/0xdiag/datasets/standard" h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 4000 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # Update: need unique key names apparently. can't overwrite prior parse output key? # replicating lines means they'll get reparsed. good! (but give new key names) csvFilenameList = [ ("covtype.data", "c"), ("covtype20x.data", "c20"), ("covtype200x.data", "c200"), ("billion_rows.csv.gz", "b"), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename exec_list(exprList, lenNodes, csvFilename, key2)
def test_C_kmeans_prostate(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "prostate.csv" key2 = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_exec_import_hosts_bigfiles(self): # just do the import folder once importFolderPath = "/home/0xdiag/datasets/standard" h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 4000 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # Update: need unique key names apparently. can't overwrite prior parse output key? # replicating lines means they'll get reparsed. good! (but give new key names) csvFilenameList = [ ("covtype.data", "c"), ("covtype20x.data", "c20"), ("covtype200x.data", "c200"), ("billion_rows.csv.gz", "b"), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename exec_list(exprList, lenNodes, csvFilename, key2)
def test_parse_covtype20x_loop(self): csvFilename = "covtype20x.data" importFolderPath = "/home/0xdiag/datasets" trialMax = 2 for tryJvms in [1,2,3,4]: for tryHeap in [1,3]: print "\n", tryHeap,"GB heap,", tryJvms, "jvm per host, import folder,", \ "then loop parsing 'covtype20x.data' to unique keys" h2o_hosts.build_cloud_with_hosts(node_count=tryJvms, java_heap_GB=tryHeap) timeoutSecs=300 for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again h2i.setupImportFolder(None, importFolderPath) key2 = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # h2o removes key after parse now ## print "Removing", parseKey['source_key'] ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key']) ### print "removeKeyResult:", h2o.dump_json(removeKeyResult) # sticky ports? h2o.tear_down_cloud() time.sleep(tryJvms * 5)
def test_import_covtype_parse_loop(self): csvFilename = "covtype.data" importFolderPath = "/home/0xdiag/datasets/standard" trialMax = 2 localhost = h2o.decide_if_localhost() for tryHeap in [4, 3, 2, 1]: print "\n", tryHeap, "GB heap, 1 jvms, import folder, then loop parsing 'covtype.data' to unique keys" if (localhost): h2o.build_cloud(node_count=1, java_heap_GB=tryHeap) else: h2o_hosts.build_cloud_with_hosts( node_count=1, java_heap_GB=tryHeap) for trial in range(trialMax): # import each time, because h2o deletes source file after parse h2i.setupImportFolder(None, importFolderPath) key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2i.parseImportFolderFile( None, csvFilename, importFolderPath, key2=key2, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() time.sleep(2)
def test_RF_poker_311M(self): # since we'll be waiting, pop a browser h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) csvFilename = 'new-poker-hand.full.311M.txt.gz' for trials in range(2): parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename start = time.time() RFview = h2o_cmd.runRFOnly(trees=5, depth=5, parseKey=parseKey, timeoutSecs=600, retryDelaySecs=10.0) print "RF end on ", csvFilename, 'took', time.time( ) - start, 'seconds'
def test_rf_kddcup_1999(self): # since we'll be waiting, pop a browser h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) csvFilename = 'kddcup_1999.data.gz' print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\ "compared to running with the parameters specified and matching the browser RF query defaults. " +\ "Also run the param for full scoring vs OOBE scoring." parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) for trials in range(4): print "\n" + csvFilename, "Trial #", trials start = time.time() kwargs = { 'response_variable': 'classifier', 'ntree': 200, 'gini': 1, 'class_weights': None, 'stratify': 0, # 'features': None, 'features': 7, 'ignore': None, 'sample': 67, 'bin_limit': 1024, 'depth': 2147483647, 'seed': 784834182943470027, 'parallel': 1, 'exclusive_split_limit': None, } if trials == 0: kwargs = {} elif trials == 1: kwargs['out_of_bag_error_estimate'] = None elif trials == 2: kwargs['out_of_bag_error_estimate'] = 0 elif trials == 3: kwargs['out_of_bag_error_estimate'] = 1 start = time.time() RFview = h2o_cmd.runRFOnly(trees=50, parseKey=parseKey, timeoutSecs=300, retryDelaySecs=1.0, **kwargs) print "RF end on ", csvFilename, 'took', time.time( ) - start, 'seconds' h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_import_covtype_parse_loop(self): csvFilename = "covtype.data" importFolderPath = "/home/0xdiag/datasets/standard" trialMax = 2 localhost = h2o.decide_if_localhost() for tryHeap in [4, 3, 2, 1]: print "\n", tryHeap, "GB heap, 1 jvms, import folder, then loop parsing 'covtype.data' to unique keys" if (localhost): h2o.build_cloud(node_count=1, java_heap_GB=tryHeap) else: h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap) for trial in range(trialMax): # import each time, because h2o deletes source file after parse h2i.setupImportFolder(None, importFolderPath) key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() time.sleep(2)
def test_B_kmeans_benign(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "benign.csv" key2 = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right? parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename expected = [ ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) , ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) , ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? 
kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_rf_allyears2k_oobe(self): importFolderPath = '/home/0xdiag/datasets' csvFilename = 'allyears2k.csv' csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) for trial in range(10): kwargs = paramDict timeoutSecs = 30 + kwargs['ntree'] * 2 start = time.time() # randomize the node node = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix']['classification_error'] rows_skipped = rfView['confusion_matrix']['rows_skipped'] mtry = rfView['mtry'] mtry_nodes = rfView['mtry_nodes'] print "mtry:", mtry print "mtry_nodes:", mtry_nodes self.assertEqual(classification_error, 0, "Should have zero oobe error") self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped") print "Trial #", trial, "completed"
def test_B_importFolder_files(self): # just do the import folder once importFolderPath = "/home/0xdiag/datasets/standard" h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 1500 csvFilenameAll = [ # quick test first "covtype.data", # then the real thing "billion_rows.csv.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename kwargs = { 'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1 } # one coefficient is checked a little more colX = 0 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) sys.stdout.write('\n.') sys.stdout.flush()
def test_from_import_fvec(self): print "Sets h2o.beat_features like -bf at command line" print "this will redirect import and parse to the 2 variants" h2o.beta_features = True # this will redirect import and parse to the 2 variants importFolderPath = '/home/0xdiag/datasets/standard' importFolderResult = h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 500 csvFilenameAll = [ "covtype.data", "covtype20x.data", # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) if not h2o.beta_features: print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseKey['destination_key'], timeoutSecs=30) if not h2o.beta_features: RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs) ## h2b.browseJsonHistoryAsUrlLastMatch("RFView") ## time.sleep(10) # just to make sure we test this # FIX! currently the importFolderResult is empty for fvec if 1 == 0: h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) sys.stdout.write('.') sys.stdout.flush()
def test_sum_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 # ("covtype20x.data", "cD", 50, 20), # ("covtype200x.data", "cE", 50, 200), csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype.data", "cB", 5, 1), ("covtype.data", "cC", 5, 1), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols( lenNodes, exprList, key2, minCol=0, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'YearPredictionMSD.txt' ] else: csvFilenameList = [ 'YearPredictionMSD.txt' ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) validations1= {} coefficients1= {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) # different when n_foldsidation is used? No trainingErrorDetails? h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_B_importFolder_files(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "billion_rows.csv.gz", csvFilenameAll = [ # quick test first "covtype.data", # then the real thing "billion_rows.csv.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, pollTimeoutSecs=60) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 # RF seems to get memory allocation errors on single machine (16GB dram) ### RFview = h2o_cmd.runRFOnly(trees=1,depth=5,parseKey=parseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") # now some GLm kwargs = {'x': 0, 'y': 1, 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1} # one coefficient is checked a little more colX = 0 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) sys.stdout.write('\n.') sys.stdout.flush()
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex'} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_rf_covtype_fvec(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilename key2 = csvFilename + ".hex" h2i.setupImportFolder(None, importFolderPath) print "\nUsing header=0 on the normal covtype.data" parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=0, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() # do oobe kwargs['out_of_bag_error_estimate'] = 1 kwargs['model_key'] = "model_" + str(jobDispatch) # don't poll for fvec rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs) elapsed = time.time() - start print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print h2o.dump_json(rfResult) # FIX! are these already in there? rfView = {} rfView['data_key'] = key2 rfView['model_key'] = kwargs['model_key'] rfView['ntree'] = kwargs['ntree'] rfViewInitial.append(rfView) print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5) # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntree = rfView['ntree'] # allow it to poll to complete rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
def test_from_import(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "covtype20x.data", # "billion_rows.csv.gz", csvFilenameAll = [ "covtype.data", "covtype20x.data", # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=parseKey, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_sum_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype.data", "cB", 5, 1), ("covtype.data", "cC", 5, 1), ] else: csvFilenameAll = [ ("covtype.data", "cA", 5, 1), ("covtype20x.data", "cD", 50, 20), ("covtype200x.data", "cE", 50, 200), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs) print "\n*************" print "colResultList", colResultList print "*************" if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x)/resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_from_import(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = '/home/0xdiag/datasets' timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "covtype20x.data", # "billion_rows.csv.gz", csvFilenameAll = [ "covtype.data", "covtype20x.data", # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for trial in range(3): for csvFilename in csvFilenameList: h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) elapsed = time.time() - start print csvFilename, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" print csvFilename, 'H2O reports parse time:', parseKey['response']['time'] # h2o doesn't produce this, but h2o_import.py adds it for us. print "Parse result['source_key']:", parseKey['source_key'] print "Parse result['destination_key']:", parseKey['destination_key'] print "\n" + csvFilename storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # h2o deletes key after parse now ## print "Removing", parseKey['source_key'], "so we can re-import it" ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key']) ## print "removeKeyResult:", h2o.dump_json(removeKeyResult) print "\nTrial", trial, "completed\n"
def test_short(self): csvFilename = 'part-00000b' ### csvFilename = 'short' importFolderPath = '/home/hduser/data' importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, separator=9) print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds" print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_import_multi_syn_datasets(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = '/home/0xdiag/datasets' print "This imports a folder of csv files..i.e points to syn_datasets with no regex" print "Doesn't put anything in syn_datasets. When run with import folder redirected" print "to import S3, there is a syn_datasets with 100 files" print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?" timeoutSecs = 500 if h2o.nodes[0].redirect_import_folder_to_s3_path: csvFilenameAll = [ # FIX! ..just folder doesn't appear to work. add regex # need a destination_key...h2o seems to use the regex if I don't provide one ### "syn_datasets/*", "syn_datasets/*_10000x200*", ] else: csvFilenameAll = [ # FIX! ..just folder doesn't appear to work. add regex # need a destination_key...h2o seems to use the regex if I don't provide one ### "syn_datasets/*", "syn_datasets/*", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # have to import each time, because h2o deletes source after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2="syn_datasets.hex", timeoutSecs=500) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ "from all files num_rows:", "{:,}".format(inspect['num_rows']), \ "num_cols:", "{:,}".format(inspect['num_cols']) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() 
- start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # so we can see! h2b.browseJsonHistoryAsUrlLastMatch("RFView") time.sleep(5)
def test_rf_kddcup_1999(self): # since we'll be waiting, pop a browser h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) csvFilename = 'kddcup_1999.data.gz' print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\ "compared to running with the parameters specified and matching the browser RF query defaults. " +\ "Also run the param for full scoring vs OOBE scoring." parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None,parseKey['destination_key']) for trials in range(4): print "\n" + csvFilename, "Trial #", trials start = time.time() kwargs = { 'response_variable': 'classifier', 'ntree': 200, 'gini': 1, 'class_weights': None, 'stratify': 0, # 'features': None, 'features': 7, 'ignore': None, 'sample': 67, 'bin_limit': 1024, 'depth': 2147483647, 'seed': 784834182943470027, 'parallel': 1, 'exclusive_split_limit': None, } if trials == 0: kwargs = {} elif trials == 1: kwargs['out_of_bag_error_estimate'] = None elif trials == 2: kwargs['out_of_bag_error_estimate'] = 0 elif trials == 3: kwargs['out_of_bag_error_estimate'] = 1 start = time.time() RFview = h2o_cmd.runRFOnly(trees=50,parseKey=parseKey, timeoutSecs=300, retryDelaySecs=1.0, **kwargs) print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds' h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (csvFilename, key2, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_KMeans_winesPCA(self): if localhost: csvFilenameList = [ #with winesPCA2.csv speciy cols = "1,2" ('winesPCA.csv', 480, 'cA'), ] else: # None is okay for key2 csvFilenameList = [ ('winesPCA.csv', 480,'cA'), # ('covtype200x.data', 1000,'cE'), ] importFolderPath = os.path.abspath(h2o.find_file('smalldata')) h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, 'winesPCA.csv', importFolderPath, timeoutSecs=2000, key2=key2) # noise=('JStack', None) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { #appears not to take 'cols'? 'cols': None, 'epsilon': 1e-6, 'k': 3 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) print "Expected centers: [-2.276318, -0.965151], with 59 rows." print " [0.0388763, 1.63886039], with 71 rows." print " [2.740469, -1.237816], with 48 rows." model_key = kmeans['destination_key'] kmeansScoreResult = h2o.nodes[0].kmeans_score( key = parseKey['destination_key'], model_key = model_key) score = kmeansScoreResult['score']
def test_from_import_fvec(self): print "Sets h2o.beat_features like -bf at command line" print "this will redirect import and parse to the 2 variants" h2o.beta_features = True # this will redirect import and parse to the 2 variants importFolderPath = '/home/0xdiag/datasets/standard' importFolderResult = h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 500 csvFilenameAll = [ "covtype.data", "covtype20x.data", # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) if not h2o.beta_features: print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(key=parseKey['destination_key'], timeoutSecs=30) if not h2o.beta_features: RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=parseKey, timeoutSecs=timeoutSecs) ## h2b.browseJsonHistoryAsUrlLastMatch("RFView") ## time.sleep(10) # just to make sure we test this # FIX! currently the importFolderResult is empty for fvec if 1==0: h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) sys.stdout.write('.') sys.stdout.flush()
def test_KMeans_params_rand2(self): if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 800), ] else: csvFilenameList = [ ('covtype20x.data', 800), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params(SEED) for trial in range(3): # default params = { 'k': 1, 'destination_key': csvFilename + "_" + str(trial) + '.hex' } h2o_kmeans.pickRandKMeansParams(paramDict, params) kwargs = params.copy() start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_from_import(self): importFolderPath = '/home/0xdiag/datasets/standard' timeoutSecs = 500 csvFilenameAll = [ "covtype.data", "covtype20x.data", # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll for trial in range(3): for csvFilename in csvFilenameList: h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) elapsed = time.time() - start print csvFilename, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs), "\n" print csvFilename, 'H2O reports parse time:', parseKey[ 'response']['time'] # h2o doesn't produce this, but h2o_import.py adds it for us. print "Parse result['python_source_key']:", parseKey[ 'python_source_key'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] print "\n" + csvFilename storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # h2o deletes key after parse now ## print "Removing", parseKey['python_source_key'], "so we can re-import it" ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['python_source_key']) ## print "removeKeyResult:", h2o.dump_json(removeKeyResult) print "\nTrial", trial, "completed\n"
def test_vector_filter_factor(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype20x.data", "cC", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # have to import each time, because h2o deletes the source file after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_GLM_covtype20x(self): if localhost: csvFilenameList = [ # 68 secs on my laptop? ('covtype20x.data', 480, 'cA'), ] else: # None is okay for key2 csvFilenameList = [ ('covtype20x.data', 480,'cA'), # ('covtype200x.data', 1000,'cE'), ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, key2=key2, noise=('JStack', None)) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { 'cols': None, 'epsilon': 1e-4, 'k': 2 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect)
def test_C_kmeans_prostate(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "prostate.csv" key2 = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117), ([63.93984962406015], 133, 611.5187969924812), ([71.55307262569832], 179, 1474.2458100558654), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } # for fvec only? kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_B_importFolder_files(self): # just do the import folder once importFolderPath = "/home/0xdiag/datasets/standard" h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 1500 csvFilenameAll = [ # quick test first "covtype.data", # then the real thing "billion_rows.csv.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1} # one coefficient is checked a little more colX = 0 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) sys.stdout.write('\n.') sys.stdout.flush()
def test_KMeans_covtype20x(self): if localhost: csvFilenameList = [ # 68 secs on my laptop? ('covtype20x.data', 480, 'cA'), ] else: # None is okay for key2 csvFilenameList = [ ('covtype20x.data', 480,'cA'), # ('covtype200x.data', 1000,'cE'), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, key2=key2) # noise=('JStack', None) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { 'cols': None, 'epsilon': 1e-4, 'k': 2, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_KMeans_params_rand2(self): if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype.data', 800), ] else: csvFilenameList = [ ('covtype.data', 800), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params(SEED) for trial in range(3): # default params = {'k': 1 } # 'destination_key': csvFilename + "_" + str(trial) + '.hex'} h2o_kmeans.pickRandKMeansParams(paramDict, params) kwargs = params.copy() start = time.time() kmeans = h2o_cmd.runKMeansGridOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_slice(self): importFolderPath = "/home/0xdiag/datasets/standard" h2o_import.setupImportFolder(None, importFolderPath) csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # try the error case list # I suppose we should test the expected error is correct. # Right now just make sure things don't blow up h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, maxCol=53, maxRow=400000, maxTrials=5, timeoutSecs=timeoutSecs, ignoreH2oError=True) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
def test_exec_import_hosts(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2i.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 15), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cB", 15), ("covtype20x.data", "cD", 60), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (csvFilename, key2, timeoutSecs) in csvFilenameList: cnum += 1 # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 # we use factor in this test...so timeout has to be bigger! h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
def test_RF_poker_311M(self): # since we'll be waiting, pop a browser h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) csvFilename = 'new-poker-hand.full.311M.txt.gz' for trials in range(2): parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None,parseKey['destination_key']) print "\n" + csvFilename start = time.time() RFview = h2o_cmd.runRFOnly(trees=5,depth=5,parseKey=parseKey, timeoutSecs=600, retryDelaySecs=10.0) print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'
def test_vector_filter_factor(self): # just do the import folder once # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 if localhost: maxTrials = 200 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype.data", "cB", 5), ] else: maxTrials = 20 csvFilenameAll = [ ("covtype.data", "cA", 5), ("covtype20x.data", "cC", 50), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # have to import each time, because h2o deletes the source file after parse h2i.setupImportFolder(None, importFolderPath) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
def test_import_covtype_parse_loop(self): csvFilename = "covtype.data" importFolderPath = "/home/0xdiag/datasets" trialMax = 2 for tryHeap in [4,3,2,1]: print "\n", tryHeap,"GB heap, 2 jvms, import folder, then loop parsing 'covtype.data' to unique keys" localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2, java_heap_GB=tryHeap) else: h2o_hosts.build_cloud_with_hosts(node_count=2, java_heap_GB=tryHeap) h2i.setupImportFolder(None, importFolderPath) for trial in range(trialMax): key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=20) # sticky ports? h2o.tear_down_cloud() print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def test_rf_allyears2k_oobe(self): importFolderPath = '/home/0xdiag/datasets/standard' csvFilename = 'allyears2k.csv' csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) for trial in range(10): kwargs = paramDict timeoutSecs = 30 + kwargs['ntree'] * 2 start = time.time() # randomize the node node = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)] rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix'][ 'classification_error'] rows_skipped = rfView['confusion_matrix']['rows_skipped'] mtry = rfView['mtry'] mtry_nodes = rfView['mtry_nodes'] print "mtry:", mtry print "mtry_nodes:", mtry_nodes self.assertEqual(classification_error, 0, "Should have zero oobe error") self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped") print "Trial #", trial, "completed"
def test_exec_import_hosts(self): # importFolderPath = "/home/hduser/hdfs_datasets" importFolderPath = "/home/0xdiag/datasets" h2o_import.setupImportFolder(None, importFolderPath) # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameAll = [ ("covtype.data", "cA", 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # we use colX+1 so keep it to 53 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_RF_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 100 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_reals_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 
1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_storeview_import(self): SYNDATASETS_DIR = h2o.make_syn_dir() importFolderPath = "/home/0xdiag/datasets/standard" csvFilelist = [ ("covtype.data", 300), ] # IMPORT********************************************** # H2O deletes the source key. So re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) # the list could be from hdfs/s3 (ec2 remap) or local. They have to different list structures if 'succeeded' in importFolderResult: succeededList = importFolderResult['succeeded'] elif 'files' in importFolderResult: succeededList = importFolderResult['files'] else: raise Exception("Can't find 'files' or 'succeeded' in import list") ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 3, "Should see more than 3 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** key2 = csvFilename + "_" + str(trial) + ".hex" print "parse start on:", csvFilename start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # SUMMARY**************************************** # gives us some reporting on missing values, constant values, # to see if we have x specified well # figures out everything from parseKey['destination_key'] # needs y to avoid output column (which can be index or name) # assume all the configs have the same y..just check with the firs tone goodX = h2o_glm.goodXFromColumnInfo( y=0, key=parseKey['destination_key'], timeoutSecs=300) summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) # STOREVIEW*************************************** print "Trying StoreView to all nodes after the parse" for n, node in enumerate(h2o.nodes): print "\n*****************" print "StoreView node %s:%s" % (node.http_addr, node.port) storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30) f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") result = json.dump(storeViewResult, f, indent=4, sort_keys=True, default=str) f.close() lastStoreViewResult = storeViewResult print "Trial #", trial, "completed in", time.time( ) - trialStart, "seconds." trial += 1
def test_GLM_100Mx70_hosts(self): # enable this if you need to re-create the file if 1 == 0: SYNDATASETS_DIR = h2o.make_syn_dir() createList = [ (100000000, 70, 'cA', 10000), ] for (rowCount, colCount, key2, timeoutSecs) in createList: csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname SEEDPERFILE = random.randint(0, sys.maxint) write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # Have to copy it to /home/0xdiag/datasets! if localhost: csvFilenameList = [ # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'), # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'), ('rand_logreg_100000000x70.csv.gz', 500, 'rand_100Mx70.hex'), ] else: # None is okay for key2 csvFilenameList = [ # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'), # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'), ('rand_logreg_100000000x70.csv.gz', 500, 'rand_100Mx70.hex'), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000, retryDelaySecs=5, initialDelaySecs=10, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "\n" + csvPathname, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols) y = num_cols - 1 kwargs = { 'family': 'binomial', 'link': 'logit', 'y': y, 'max_iter': 8, 'n_folds': 0, 'beta_epsilon': 1e-4, 'alpha': 0, 'lambda': 0 } for trial in range(3): start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - 
start print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'covtype.data', ] else: csvFilenameList = [ 'covtype200x.data', 'covtype200x.data', 'covtype.data', 'covtype.data', 'covtype20x.data', 'covtype20x.data', ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 synSize = 183 if 1==0: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600), ] if 1==1: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? 
# ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), # for now, take too long on 2x100GB heap on 164 # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * 
avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/standard' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? # os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? 
# ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200), ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split out the 
pattern match and the filename used for the hex trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack'] # benchmarkLogging = None benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks' jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = ' -Dcom.sun.management.jmxremote.port=54330' + \ ' -Dcom.sun.management.jmxremote.authenticate=false' + \ ' -Dcom.sun.management.jmxremote.ssl=false' + \ ' -Dcom.sun.management.jmxremote' + \ ' -Dcom.sun.management.jmxremote.local.only=false' jea = ' -Dlog.printAll=true' for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) # pop open a browser on the cloud ### h2b.browseTheCloud() # to avoid sticky ports? 
### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder(None, importFolderPath) importFullList = importFolderResult['files'] importFailList = importFolderResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]: x.remove(i) x = ",".join(map(str,x)) GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
def test_parse_bounds_libsvm(self):
    # For each libsvm dataset below: import the folder, parse the file,
    # time an Inspect of the parsed key, then run two KMeans trials with a
    # fixed seed and check each resulting model.
    importFolderPath = "/home/0xdiag/datasets/libsvm"

    # Rows are (csvFilename, key2, timeoutSecs, resultMult).
    # NOTE(review): the last two fields are not read by this test; parse,
    # inspect and kmeans all use the fixed timeouts defined below. The rows
    # are left as they were so the table can be shared/compared with other
    # tests -- confirm whether per-dataset timeouts are still wanted.
    csvFilenameList = [
        ("mnist_train.svm", "cM", 30, 1),
        # FIX! fails KMeansScore
        ("tmc2007_train.svm", "cJ", 30, 1),
        ("covtype.binary.svm", "cC", 30, 1),
        ("colon-cancer.svm", "cA", 30, 1),
        ("connect4.svm", "cB", 30, 1),
        ("duke.svm", "cD", 30, 1),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1),
        ("gisette_scale.svm", "cF", 30, 1),
        ("mushrooms.svm", "cG", 30, 1),
        ("news20.svm", "cH", 30, 1),
        ("syn_6_1000_10.svm", "cK", 30, 1),
        ("syn_0_100_1000.svm", "cL", 30, 1),
        # normal csv
    ]

    # Fixed timeouts (seconds) for each stage. The kmeans value used to be
    # assigned to 'timeoutSecs' inside the trial loop, clobbering the value
    # unpacked from the table; it now has its own name.
    parseTimeoutSecs = 2000
    inspectTimeoutSecs = 360
    kmeansTimeoutSecs = 600

    # The same KMeans arguments are used for every file and every trial, so
    # build them once. They are only ever passed with **, so the callees
    # cannot mutate this dict.
    kmeansKwargs = {
        'k': 3,
        'epsilon': 1e-6,
        # not passed: 'cols': 2, 'max_iter': 10, 'normalize': 0
        # reuse the same seed, to get deterministic results (otherwise sometimes fails)
        'seed': 265211114317615310,
        # 'destination_key': parseKey['destination_key'] is deliberately not
        # passed: it fails when source == dest.
    }

    # All prints below pass one pre-formatted string in parentheses, so the
    # output is the same whether 'print' is the statement or the function.
    for (csvFilename, key2, _timeoutSecs, _resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)
        csvPathname = importFolderPath + "/" + csvFilename

        # PARSE******************************************
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=parseTimeoutSecs)
        print("%s parse time: %s" % (csvPathname, parseKey['response']['time']))
        destinationKey = parseKey['destination_key']
        print("Parse result['destination_key']: %s" % (destinationKey,))

        # INSPECT******************************************
        start = time.time()
        inspectResult = h2o_cmd.runInspect(None, destinationKey,
            timeoutSecs=inspectTimeoutSecs)
        print("Inspect: %s took %s seconds" % (destinationKey, time.time() - start))
        h2o_cmd.infoFromInspect(inspectResult, csvFilename)

        # KMEANS******************************************
        for _trial in range(2):
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                timeoutSecs=kmeansTimeoutSecs, **kmeansKwargs)
            elapsed = time.time() - start
            # two spaces after "on" are intentional: they match the original output
            print("kmeans end on  %s took %s seconds. %d pct. of timeout" % (
                csvPathname, elapsed, (elapsed / kmeansTimeoutSecs) * 100))

            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kmeansKwargs)
            # The returned pair is not examined further here; the unpacking is
            # kept so a change in the helper's return shape still fails loudly.
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kmeansKwargs)