def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex', 'seed': 265211114317615310} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # cluster centers can return in any order centersSorted = sorted(centers, key=itemgetter(0)) self.assertAlmostEqual(centersSorted[0][0],100,delta=.2) self.assertAlmostEqual(centersSorted[1][0],200,delta=.2) self.assertAlmostEqual(centersSorted[2][0],300,delta=.2) self.assertAlmostEqual(centersSorted[0][1],100,delta=.2) self.assertAlmostEqual(centersSorted[1][1],200,delta=.2) self.assertAlmostEqual(centersSorted[2][1],300,delta=.2) self.assertAlmostEqual(centersSorted[0][2],100,delta=.2) self.assertAlmostEqual(centersSorted[1][2],200,delta=.2) self.assertAlmostEqual(centersSorted[2][2],300,delta=.2) show_results(csvPathname, parseKey, model_key, centers, 'd')
def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_spheres3_" + str(SEED) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = {"k": 3, "epsilon": 1e-6, "cols": None, "destination_key": "spheres3.hex", "seed": 265211114317615310} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, "d", **kwargs) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 11, 'cA', 5), (100, 10, 'cB', 5), (100, 9, 'cC', 5), (100, 8, 'cD', 5), (100, 7, 'cE', 5), (100, 6, 'cF', 5), (100, 5, 'cG', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") print "Parse result['destination_key']:", parseKey['destination_key'] kwargs = {'k': 2, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_KMeans_constant_col(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 11, 'cA', 5), (100, 10, 'cB', 5), (100, 9, 'cC', 5), (100, 8, 'cD', 5), (100, 7, 'cE', 5), (100, 6, 'cF', 5), (100, 5, 'cG', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print "Generate synthetic dataset with first column constant = 0 and see what KMeans does" cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex") print "Parse result['destination_key']:", parseResult['destination_key'] kwargs = {'k': 2, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) # check center list (first center) has same number of cols as source data self.assertEqual(colCount, len(centers[0]), "kmeans first center doesn't have same # of values as dataset row %s %s" % (colCount, len(centers[0])))
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 11, 'cA', 5), (100, 10, 'cB', 5), (100, 9, 'cC', 5), (100, 8, 'cD', 5), (100, 7, 'cE', 5), (100, 6, 'cF', 5), (100, 5, 'cG', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") print "Parse result['destination_key']:", parseKey['destination_key'] kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_B_kmeans_benign(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "benign.csv" key2 = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right? parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename expected = [ ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) , ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) , ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? 
kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "prostate.csv" key2 = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str,range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str, range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) ### print h2o.dump_json(inspect) # compare this kmeans to the first one. since the files are replications, the results # should be similar? KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 11, "cA", 5), (100, 10, "cB", 5), (100, 9, "cC", 5), (100, 8, "cD", 5), (100, 7, "cE", 5), (100, 6, "cF", 5), (100, 5, "cG", 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = "syn_" + str(SEED) + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") print "Parse result['destination_key']:", parseKey["destination_key"] kwargs = {"k": 2, "epsilon": 1e-6, "cols": None, "destination_key": "benign_k.hex"} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, "d", **kwargs)
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") kwargs = {'k': 1, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex'} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers for i in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers for i in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_B_kmeans_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") expected = [ ([ 24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117 ], 77, 46889.32010560476), ([ 25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282 ], 114, 64011.20272144667), ([ 30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667 ], 12, 13000.485226507595), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans_twit(self): csvFilename = "Twitter2DB.txt" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/' + csvFilename) # h2b.browseTheCloud() # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers # should check the means? # FIX! have to fix these to right answers expected = [ # expected centers are from R. rest is just from h2o ([310527.2, 13433.89], 11340, None), ([5647967.1, 40487.76], 550, None), ([21765291.7, 93129.26], 14, None), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'max_iter': 50, 'epsilon': 1e-4, 'normalize': 0, 'cols': '0,1', 'initialization': 'Furthest', # 'initialization': 'PlusPlus', 'destination_key': 'kmeans_dest_key', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) if 1 == 0: h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore") h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply") h2b.browseJsonHistoryAsUrlLastMatch("KMeans") time.sleep(3600) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans_winesPCA(self): csvPathname = h2o.find_file('smalldata/winesPCA.csv') start = time.time() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10) print "parse end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { #appears not to take 'cols'? 'cols': None, 'initialization': 'Furthest', 'k': 3, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } timeoutSecs = 480 # try the same thing 5 times for trial in range(10): start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = \ h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster # now compare expected vs actual. By sorting on center, we should be able to compare # since the centers should be separated enough to have the order be consistent expected = [ ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794), ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745), ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474), ] # multipliers on the expected values for allowed allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str, range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [([ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154 ], num_rows, None)] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_KMeans_winesPCA(self): if localhost: csvFilenameList = [ #with winesPCA2.csv speciy cols = "1,2" ('winesPCA.csv', 480, 'cA'), ] else: # None is okay for key2 csvFilenameList = [ ('winesPCA.csv', 480,'cA'), # ('covtype200x.data', 1000,'cE'), ] importFolderPath = os.path.abspath(h2o.find_file('smalldata')) h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, 'winesPCA.csv', importFolderPath, timeoutSecs=2000, key2=key2) # noise=('JStack', None) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { #appears not to take 'cols'? 'cols': None, 'epsilon': 1e-6, 'k': 3 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) print "Expected centers: [-2.276318, -0.965151], with 59 rows." print " [0.0388763, 1.63886039], with 71 rows." print " [2.740469, -1.237816], with 48 rows." model_key = kmeans['destination_key'] kmeansScoreResult = h2o.nodes[0].kmeans_score( key = parseKey['destination_key'], model_key = model_key) score = kmeansScoreResult['score']
def test_KMeans_params_rand2(self): if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 800), ] else: csvFilenameList = [ ('covtype20x.data', 800), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params(SEED) for trial in range(3): # default params = { 'k': 1, 'destination_key': csvFilename + "_" + str(trial) + '.hex' } h2o_kmeans.pickRandKMeansParams(paramDict, params) kwargs = params.copy() start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_C_kmeans_prostate(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "prostate.csv" key2 = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117), ([63.93984962406015], 133, 611.5187969924812), ([71.55307262569832], 179, 1474.2458100558654), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } # for fvec only? kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_GLM_covtype20x(self): if localhost: csvFilenameList = [ # 68 secs on my laptop? ('covtype20x.data', 480, 'cA'), ] else: # None is okay for key2 csvFilenameList = [ ('covtype20x.data', 480,'cA'), # ('covtype200x.data', 1000,'cE'), ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, key2=key2, noise=('JStack', None)) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { 'cols': None, 'epsilon': 1e-4, 'k': 2 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect)
def test_kmeans_sphere5(self): SYNDATASETS_DIR = h2o.make_syn_dir() CLUSTERS = 5 SPHERE_PTS = 10000 csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # try 5 times, to see if all inits by h2o are good for trial in range(5): kwargs = {'k': CLUSTERS, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'syn_spheres100.hex'} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] clustersSorted = sorted(clusters, key=itemgetter(0)) ### print clustersSorted print "\nh2o result, centers sorted" print clustersSorted print "\ngenerated centers" print centersList for i,center in enumerate(centersList): a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str,a)) bStr = ",".join(map(str,b)) iStr = str(i) self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" x not correct.") self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" y not correct.") self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" z not correct.") print "Trial #", trial, "completed"
def test_C_kmeans_prostate(self):
    """Run k=3 KMeans on smalldata/logreg/prostate.csv twice and print the centers.

    The loop is only to eyeball whether successive runs produce the same
    centers; nothing is asserted beyond simpleCheckKMeans.
    """
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    # loop, to see if we get same centers
    for i in range(2):
        kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None,
            'destination_key': 'prostate_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        model_key = kmeans['destination_key']
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        centers = kmeansResult['KMeansModel']['clusters']
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        show_results(csvPathname, parseKey, model_key, centers, 'd')
def test_KMeans_constant_col(self):
    """KMeans on synthetic datasets whose first column is constant 0.

    Verifies that the returned cluster centers still have one value per
    source column (constant columns are not silently dropped).
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs)
    tryList = [
        (100, 11, 'cA', 5),
        (100, 10, 'cB', 5),
        (100, 9, 'cC', 5),
        (100, 8, 'cD', 5),
        (100, 7, 'cE', 5),
        (100, 6, 'cF', 5),
        (100, 5, 'cG', 5),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        print "Parse result['destination_key']:", parseKey['destination_key']
        kwargs = {
            'k': 2,
            'initialization': 'Furthest',
            'cols': None,
            'destination_key': 'benign_k.hex'
        }
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # check center list (first center) has same number of cols as source data
        self.assertEqual(colCount, len(centers[0]),
            "kmeans first center doesn't have same # of values as dataset row %s %s" %
            (colCount, len(centers[0])))
def test_kmeans_sphere3(self):
    """KMeans k=3 on a synthetic 3-sphere dataset; centers should be ~100/200/300.

    Sorts the returned clusters by first coordinate (h2o can return them in
    any order) and asserts each axis of each center within delta 0.2.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    write_syn_dataset(csvPathname, 1000000, SEED)
    print "\nStarting", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    kwargs = {
        'k': 3,
        'epsilon': 1e-6,
        'cols': None,
        'destination_key': 'spheres3.hex'
    }
    timeoutSecs = 30
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
        (elapsed / timeoutSecs) * 100)
    kmeansResult = h2o_cmd.runInspect(key='spheres3.hex')
    ### print h2o.dump_json(kmeans)
    print h2o.dump_json(kmeansResult)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
    clusters = kmeansResult['KMeansModel']['clusters']
    # cluster centers can return in any order
    clustersSorted = sorted(clusters, key=itemgetter(0))
    self.assertAlmostEqual(clustersSorted[0][0], 100, delta=.2)
    self.assertAlmostEqual(clustersSorted[1][0], 200, delta=.2)
    self.assertAlmostEqual(clustersSorted[2][0], 300, delta=.2)
    self.assertAlmostEqual(clustersSorted[0][1], 100, delta=.2)
    self.assertAlmostEqual(clustersSorted[1][1], 200, delta=.2)
    self.assertAlmostEqual(clustersSorted[2][1], 300, delta=.2)
    self.assertAlmostEqual(clustersSorted[0][2], 100, delta=.2)
    self.assertAlmostEqual(clustersSorted[1][2], 200, delta=.2)
    self.assertAlmostEqual(clustersSorted[2][2], 300, delta=.2)
def test_KMeans_winesPCA(self):
    """KMeans k=3 on smalldata/winesPCA.csv, 10 trials, fixed seed.

    Compares (center, rows_per_cluster, sqr_error_per_cluster) tuples against
    hard-coded expected values with a 1% multiplier tolerance.
    """
    csvPathname = h2o.find_file('smalldata/winesPCA.csv')
    start = time.time()
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
    print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])
    kwargs = {
        #appears not to take 'cols'?
        'cols': None,
        'initialization': 'Furthest',
        'k': 3,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    timeoutSecs = 480
    # try the same thing 5 times
    for trial in range(10):
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        (centers, tupleResultList) = \
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster
        # now compare expected vs actual. By sorting on center, we should be able to compare
        # since the centers should be separated enough to have the order be consistent
        expected = [
            ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794) ,
            ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745) ,
            ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474) ,
        ]
        # multipliers on the expected values for allowed
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def test_KMeans_covtype20x(self):
    """KMeans k=2 on covtype20x from the standard import folder, fixed seed.

    Uses bigCheckResults for the full consistency check; no expected centers
    are asserted here.
    """
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('covtype20x.data', 480, 'cA'),
            # ('covtype200x.data', 1000,'cE'),
        ]
    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2) # noise=('JStack', None)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        kwargs = {
            'cols': None,
            'epsilon': 1e-4,
            'k': 2,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_KMeans_params_rand2(self):
    """Fuzz KMeans parameters: 3 trials with randomly picked params on covtype20x.

    define_params(SEED)/pickRandKMeansParams supply the random parameter set;
    each trial only sanity-checks the result via simpleCheckKMeans.
    """
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype20x.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype20x.data', 800),
        ]
    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            params = {'k': 1, 'destination_key': csvFilename + "_" + str(trial) + '.hex'}
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
def test_KMeans_twit(self):
    """KMeans k=3 on smalldata/Twitter2DB.txt (cols 0,1 only, no normalize).

    Two trials with a fixed seed; results compared against centers taken
    from R (row counts included, error terms unchecked/None).
    """
    csvFilename = "Twitter2DB.txt"
    print "\nStarting", csvFilename
    csvPathname = h2o.find_file('smalldata/' + csvFilename)
    # h2b.browseTheCloud()
    # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    # loop, to see if we get same centers
    # should check the means?
    # FIX! have to fix these to right answers
    expected = [
        # expected centers are from R. rest is just from h2o
        ([310527.2, 13433.89], 11340, None),
        ([5647967.1, 40487.76], 550, None),
        ([21765291.7, 93129.26], 14, None),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    for trial in range(2):
        kwargs = {
            'k': 3,
            'max_iter': 50,
            'epsilon': 1e-4,
            'normalize': 0,
            'cols': '0,1',
            'initialization': 'Furthest',
            # 'initialization': 'PlusPlus',
            'destination_key': 'kmeans_dest_key',
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310
        }
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        if 1==0:
            h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore")
            h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply")
            h2b.browseJsonHistoryAsUrlLastMatch("KMeans")
            time.sleep(3600)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30):
    """Parse csvPathname and run k=1 KMeans on its first 11 columns.

    Compares the single center against hard-coded near-zero expected values,
    then compares this run's clusters to self.clusters1 (the first run's
    clusters, cached across calls) since the files are replications.
    """
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex",
        timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex',
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
    expected = [
        ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
          0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
          0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
          0.0021324000161308796, 0.00154], num_rows, None)
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        # first call: remember these clusters for later comparisons
        self.clusters1 = copy.deepcopy(clusters)
def test_many_cols_with_syn(self):
    """KMeans k=2 over synthetic datasets with 5..11 columns; print the clusters.

    Only sanity-checks via simpleCheckKMeans; the kmeans_apply/kmeans_score
    calls are commented out (were failing).
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs)
    tryList = [
        (100, 11, 'cA', 5),
        (100, 10, 'cB', 5),
        (100, 9, 'cC', 5),
        (100, 8, 'cD', 5),
        (100, 7, 'cE', 5),
        (100, 6, 'cF', 5),
        (100, 5, 'cG', 5),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        print "Parse result['destination_key']:", parseKey['destination_key']
        kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None,
            'destination_key': 'benign_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        model_key = kmeans['destination_key']
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        ## h2o.nodes[0].kmeans_apply(data_key=parseKey['destination_key'], model_key=model_key, destination_key='a')
        # this is failing for some reason
        ## h2o.nodes[0].kmeans_score(key=parseKey['destination_key'], model_key=model_key)
        clusters = kmeansResult['KMeansModel']['clusters']
        for i,c in enumerate(clusters):
            print "clusters["+str(i)+"]: ", clusters[i]
        ## print h2o.dump_json(kmeans)
        ## print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
def test_C_kmeans_prostate(self):
    """Single k=1 KMeans run on prostate.csv; dump the model json and sanity-check."""
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': None,
        'destination_key': 'prostate_k.hex'
    }
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
    kmeansResult = h2o_cmd.runInspect(key='prostate_k.hex')
    print h2o.dump_json(kmeans)
    print h2o.dump_json(kmeansResult)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
def test_B_kmeans_benign(self):
    """KMeans k=3 on smalldata/logreg/benign.csv, two trials, fixed seed.

    Uses the newer import_parse/parseResult API. Results compared against
    hard-coded (center, rows, error) tuples with 1% multiplier tolerance.
    """
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename,
        schema='put', hex_key=csvFilename+".hex")
    expected = [
        ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
        ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
        ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    # loop, to see if we get same centers
    for trial in range(2):
        kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': None,
            'destination_key': 'benign_k.hex',
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310}
        kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self):
    """KMeans k=3 on prostate.csv column 2 only, two trials, fixed seed.

    Uses the newer import_parse/parseResult API; compares against hard-coded
    (center, rows, error) tuples with 1% multiplier tolerance.
    """
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename,
        schema='put', hex_key=csvFilename+".hex")
    # loop, to see if we get same centers
    expected = [
        ([55.63235294117647], 68, 667.8088235294117) ,
        ([63.93984962406015], 133, 611.5187969924812) ,
        ([71.55307262569832], 179, 1474.2458100558654) ,
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    for trial in range(2):
        kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': 2,
            'destination_key': 'prostate_k.hex',
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310}
        kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans_allstate_s3n_thru_hdfs(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'allstate' csvFilename = "train_set.csv" csvPathname = importFolderPath + "/" + csvFilename timeoutSecs = 600 trialMax = 3 for trial in range(trialMax): trialStart = time.time() hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print 'h2o reported parse time:', parseResult['response']['time'] print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { 'cols': None, 'initialization': 'Furthest', 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_kmeans_sphere3(self):
    """KMeans k=3, Furthest init, on synthetic 3-sphere data; 10 trials, seed=0.

    Compares (center, rows, error) tuples against the generated sphere
    parameters with a 1% multiplier tolerance.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    write_syn_dataset(csvPathname, 1000000, SEED)
    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
    for trial in range(10):
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'spheres3.hex',
            # 'seed': 265211114317615310,
            'seed': 0,
        }
        timeoutSecs = 90
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        expected = [
            ([100, 100, 100], 1000000, 60028168),
            ([200, 200, 200], 2000000, 479913618),
            ([300, 300, 300], 3000000, 1619244994),
        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self):
    """KMeans k=3 on prostate.csv column 2 only, two trials, fixed seed.

    Older parseFile/parseKey API variant; compares against hard-coded
    (center, rows, error) tuples with 1% multiplier tolerance.
    """
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    # loop, to see if we get same centers
    expected = [
        ([55.63235294117647], 68, 667.8088235294117),
        ([63.93984962406015], 133, 611.5187969924812),
        ([71.55307262569832], 179, 1474.2458100558654),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    for trial in range(2):
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'cols': 2,
            'destination_key': 'prostate_k.hex',
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310
        }
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_four_billion_rows(self):
    """Parse a 4-billion-row, 2-column file; verify sizes, then run KMeans and GLM.

    Asserts row_size and value_size_bytes match 1 byte per column, exactly
    2 columns and 4e9 rows, then runs k=3 KMeans and an L2 GLM (alpha=0,
    lambda=0) with one coefficient spot-checked by simpleCheckGLM.
    """
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/billions"
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 1500
    csvFilenameAll = [
        "four_billion_rows.csv",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll
    # pop open a browser on the cloud
    ### h2b.browseTheCloud()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        # Parse*********************************
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
        elapsed = time.time() - start
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)
        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        value_size_bytes = inspect['value_size_bytes']
        row_size = inspect['row_size']
        print "\n" + csvFilename, \
            " num_rows:", "{:,}".format(num_rows), \
            " num_cols:", "{:,}".format(num_cols), \
            " value_size_bytes:", "{:,}".format(value_size_bytes), \
            " row_size:", "{:,}".format(row_size)
        expectedRowSize = num_cols * 1 # plus output
        expectedValueSize = expectedRowSize * num_rows
        self.assertEqual(row_size, expectedRowSize,
            msg='row_size %s is not expected num_cols * 1 byte: %s' % \
            (row_size, expectedRowSize))
        self.assertEqual(value_size_bytes, expectedValueSize,
            msg='value_size_bytes %s is not expected row_size * rows: %s' % \
            (value_size_bytes, expectedValueSize))
        summaryResult = h2o_cmd.runSummary(key=parseKey['destination_key'],
            timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
        self.assertEqual(2, num_cols,
            msg="generated %s cols (including output). parsed to %s cols" % (2, num_cols))
        self.assertEqual(4 * 1000000000, num_rows,
            msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, num_rows))
        # KMeans*********************************
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'max_iter': 20,
            'cols': None,
            'normalize': 0,
            'destination_key': 'junk.hex',
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'x': 0,
            'y': 1,
            'n_folds': 0,
            'case_mode': '=',
            'case': 1
        }
        # one coefficient is checked a little more
        colX = 0
        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
def test_KMeans_allstate_s3n_thru_hdfs(self):
    """Import allstate/train_set.csv from s3n via HDFS, parse, run k=12 KMeans.

    3 trials; the import is repeated each trial because h2o deletes the
    source key after parse. Only simpleCheckKMeans is asserted.
    """
    # csvFilename = "covtype20x.data"
    # csvPathname = csvFilename
    csvFilename = "train_set.csv"
    csvPathname = "allstate/" + csvFilename
    # https://s3.amazonaws.com/home-0xdiag-datasets/allstate/train_set.csv
    URI = "s3n://home-0xdiag-datasets/"
    s3nKey = URI + csvPathname
    trialMax = 3
    for trial in range(trialMax):
        trialStart = time.time()
        # since we delete the key, we have to re-import every iteration
        # s3n URI thru HDFS is not typical.
        importHDFSResult = h2o.nodes[0].import_hdfs(URI)
        s3nFullList = importHDFSResult["succeeded"]
        ### print "s3nFullList:", h2o.dump_json(s3nFullList)
        self.assertGreater(len(s3nFullList), 8, "Didn't see more than 8 files in s3n?")
        storeView = h2o.nodes[0].store_view()
        ### print "storeView:", h2o.dump_json(storeView)
        for s in storeView["keys"]:
            print "\nkey:", s["key"]
            if "rows" in s:
                print "rows:", s["rows"], "value_size_bytes:", s["value_size_bytes"]
        key2 = csvFilename + "_" + str(trial) + ".hex"
        print "Loading s3n key: ", s3nKey, "thru HDFS"
        # ec2 is about 400 secs on four m2.4xlarge nodes
        # should be less on more nodes?
        timeoutSecs = 600
        start = time.time()
        parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs,
            retryDelaySecs=10, pollTimeoutSecs=60, noise=("JStack", None))
        elapsed = time.time() - start
        print s3nKey, "h2o reported parse time:", parseKey["response"]["time"]
        print "parse end on ", s3nKey, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)
        print "parse result:", parseKey["destination_key"]
        kwargs = {"cols": None, "epsilon": 1e-6, "k": 12}
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
            retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        ### print h2o.dump_json(kmeans)
        inspect = h2o_cmd.runInspect(None, key=kmeans["destination_key"])
        print h2o.dump_json(inspect)
        print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", "Otherwise it would just parse the cached key."
        storeView = h2o.nodes[0].store_view()
        ### print "storeView:", h2o.dump_json(storeView)
        # h2o removes key after parse now
        ### print "Removing", s3nKey
        ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)
        ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)
        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.",
def test_kmeans_sphere100(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) if SHUFFLE_SPHERES: # since we create spheres in order csvFilename2 = 'syn_spheres100_shuffled.csv' csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2 h2o_util.file_shuffle(csvPathname, csvPathname2) else: csvFilename2 = csvFilename csvPathname2 = csvPathname print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, key2=csvFilename2 + ".hex") ### h2b.browseTheCloud() # try 5 times, to see if all inits by h2o are good # does it break if cols is not specified? cols = ",".join(map(str,range(DIMENSIONS))) for trial in range(10): kwargs = { 'k': CLUSTERS, 'initialization': 'Furthest', 'cols': cols, 'destination_key': 'syn_spheres100.hex' } timeoutSecs = 100 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') # print h2o.dump_json(kmeansResult) ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] # the way we create the centers above, if we sort on the sum of xyz # we should get the order the same as when they were created. 
# to be safe, we'll sort the centers that were generated too, the same way clustersSorted = sorted(clusters, key=sum) centersSorted = sorted(centersList, key=sum) ### print clustersSorted print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)" cf = '{0:6.2f}' for c in clustersSorted: print ' '.join(map(cf.format,c)) print "\ngenerated centers (sorted by key=sum)" for c in centersSorted: print ' '.join(map(cf.format,c)) for i,center in enumerate(centersSorted): # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem. # Assuming that the difference between adjacent sums of all center values, # is greater than 2x the sum of all max allowed variance on each value, # Then the sums will be unique and non-overlapping with allowed variance. # So a sort of the centers, keyed on sum of all values for a center. # will create an ordering that can be compared. # sort gen'ed and actual separately. # Adjacent center hamming distance check is done during gen above. a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str,a)) bStr = ",".join(map(str,b)) iStr = str(i) for i, v in enumerate(a): emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct." self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg) print "Trial #", trial, "completed"
def test_KMeans_allstate_s3n_thru_hdfs(self): csvFilename = "CAT*" URI = "s3n://home-0xdiag-datasets/cats" s3nKey = URI + "/" + csvFilename trialMax = 1 for trial in range(trialMax): trialStart = time.time() # since we delete the key, we have to re-import every iteration # s3n URI thru HDFS is not typical. importHDFSResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList),1,"Didn't see more than 1 files in s3n?") storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) for s in storeView['keys']: print "\nkey:", s['key'] if 'rows' in s: print "rows:", s['rows'], "value_size_bytes:", s['value_size_bytes'] key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' # ec2 is about 400 secs on four m2.4xlarge nodes # should be less on more nodes? timeoutSecs = 600 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=('JStack', None)) elapsed = time.time() - start print s3nKey, 'h2o reported parse time:', parseKey['response']['time'] print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] kwargs = { 'cols': None, 'epsilon': 1e-6, 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() # pattern matching problem # h2o removes key afte parse now ### print "Removing", s3nKey ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_KMeans_sphere15_180GB(self):
    # Benchmark/correctness test: parse a ~180GB, 2,711,545,732-row, 6-col
    # synthetic sphere dataset (from NFS or HDFS depending on the
    # module-level FROM_HDFS flag) and run KMeans with k=15 for 6 trials,
    # rotating the initialization mode, then compare the resulting
    # (centers, rows, error) tuples against hardcoded expected values.
    csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    totalBytes = 183538602156  # known file size; used to report parse MB/sec
    if FROM_HDFS:
        importFolderPath = "/datasets/kmeans_big"
        csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    # Each entry: ([center coordinates], expected rows in cluster, expected error)
    expected = [
        ([ 0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988),
        ([ 0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98),
        ([ 0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253),
        ([ 0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474),
        ([ 0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094),
        ([ 0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475),
        ([ 0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035),
        ([ 0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276),
        ([ 0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314),
        ([ 0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955),
        ([ 0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215),
        ([ 0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249),
        ([ 0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379),
        ([ 0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982),
        ([ 0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646),
    ]

    # Successive reassignments deliberately dial back benchmark logging;
    # only the final (empty) value takes effect.
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        if FROM_HDFS:
            importFolderResult = h2i.setupImportHdfs(
                None, importFolderPath)
        else:
            importFolderResult = h2i.setupImportFolder(
                None, importFolderPath)

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        # NOTE(review): 'pollTimeoutsecs' (lowercase 's') looks like a typo
        # for 'pollTimeoutSecs' -- confirm against h2i's signature.
        if FROM_HDFS:
            parseKey = h2i.parseImportHdfsFile(
                None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, pollTimeoutsecs=60,
                retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseKey = h2i.parseImportFolderFile(
                None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, pollTimeoutsecs=60,
                retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed  # parse throughput
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname,
            fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # KMeans ****************************************
        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'junk.hex',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        # rotate the initialization mode per trial (the 'Furthest' above is
        # always overwritten here)
        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
            "trial " + str(trial), csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected,
            allowedDelta, allowError=True, trial=trial)
def test_B_kmeans_benign(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "benign.csv" key2 = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right? parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename expected = [ ([ 24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117 ], 77, 46889.32010560476), ([ 25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282 ], 114, 64011.20272144667), ([ 30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667 ], 12, 13000.485226507595), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = { 'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } # for fvec only? 
kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_kmeans_sphere5(self): SYNDATASETS_DIR = h2o.make_syn_dir() CLUSTERS = 5 SPHERE_PTS = 10000 csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # try 5 times, to see if all inits by h2o are good for trial in range(5): kwargs = { 'k': CLUSTERS, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'syn_spheres100.hex' } timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] clustersSorted = sorted(clusters, key=itemgetter(0)) ### print clustersSorted print "\nh2o result, centers sorted" print clustersSorted print "\ngenerated centers" print centersList for i, center in enumerate(centersList): a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str, a)) bStr = ",".join(map(str, b)) iStr = str(i) self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " x not correct.") self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " y not correct.") self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " z not correct.") print "Trial #", trial, "completed"
def test_kmeans_sphere100(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) if SHUFFLE_SPHERES: # since we create spheres in order csvFilename2 = 'syn_spheres100_shuffled.csv' csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2 h2o_util.file_shuffle(csvPathname, csvPathname2) else: csvFilename2 = csvFilename csvPathname2 = csvPathname print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, key2=csvFilename2 + ".hex") ### h2b.browseTheCloud() # try 5 times, to see if all inits by h2o are good # does it break if cols is not specified? cols = ",".join(map(str,range(DIMENSIONS))) for trial in range(10): kwargs = { 'k': CLUSTERS, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'syn_spheres100.hex' } timeoutSecs = 100 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') # print h2o.dump_json(kmeansResult) ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] # the way we create the centers above, if we sort on the sum of xyz # we should get the order the same as when they were created. 
# to be safe, we'll sort the centers that were generated too, the same way clustersSorted = sorted(clusters, key=sum) centersSorted = sorted(centersList, key=sum) ### print clustersSorted print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)" cf = '{0:6.2f}' for c in clustersSorted: print ' '.join(map(cf.format,c)) print "\ngenerated centers (sorted by key=sum)" for c in centersSorted: print ' '.join(map(cf.format,c)) for i,center in enumerate(centersSorted): # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem. # Assuming that the difference between adjacent sums of all center values, # is greater than 2x the sum of all max allowed variance on each value, # Then the sums will be unique and non-overlapping with allowed variance. # So a sort of the centers, keyed on sum of all values for a center. # will create an ordering that can be compared. # sort gen'ed and actual separately. # Adjacent center hamming distance check is done during gen above. a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str,a)) bStr = ",".join(map(str,b)) iStr = str(i) for i, v in enumerate(a): emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct." self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg) print "Trial #", trial, "completed"
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = { 'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_KMeans_sphere15_180GB(self):
    # Benchmark/correctness test: parse a ~180GB, 2,711,545,732-row, 6-col
    # synthetic sphere dataset (from NFS or HDFS depending on the
    # module-level FROM_HDFS flag) and run KMeans with k=15 for 6 trials,
    # rotating the initialization mode, then compare the resulting
    # (centers, rows, error) tuples against hardcoded expected values.
    csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    totalBytes = 183538602156  # known file size; used to report parse MB/sec
    if FROM_HDFS:
        importFolderPath = "/datasets/kmeans_big"
        csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    # Each entry: ([center coordinates], expected rows in cluster, expected error)
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    # Successive reassignments deliberately dial back benchmark logging;
    # only the final (empty) value takes effect.
    benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu','disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        if FROM_HDFS:
            importFolderResult = h2i.setupImportHdfs(None, importFolderPath)
        else:
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        # NOTE(review): 'pollTimeoutsecs' (lowercase 's') looks like a typo
        # for 'pollTimeoutSecs' -- confirm against h2i's signature.
        if FROM_HDFS:
            parseKey = h2i.parseImportHdfsFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, pollTimeoutsecs=60,
                retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, pollTimeoutsecs=60,
                retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        fileMBS = (totalBytes/1e6)/elapsed  # parse throughput
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n"+l
        h2o.cloudPerfH2O.message(l)

        # KMeans ****************************************
        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'junk.hex',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        # rotate the initialization mode per trial (the 'Furthest' above is
        # always overwritten here)
        if (trial%3)==0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial%3)==1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
def test_KMeans_allstate_s3n_thru_hdfs(self): # csvFilename = "covtype20x.data" # csvPathname = csvFilename csvFilename = "CAT*" csvPathname = "cats/" + csvFilename # https://s3.amazonaws.com/home-0xdiag-datasets/allstate/train_set.csv URI = "s3n://home-0xdiag-datasets/" s3nKey = URI + csvPathname trialMax = 1 for trial in range(trialMax): trialStart = time.time() # since we delete the key, we have to re-import every iteration # s3n URI thru HDFS is not typical. importHDFSResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList), 8, "Didn't see more than 8 files in s3n?") storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) for s in storeView['keys']: print "\nkey:", s['key'] if 'rows' in s: print "rows:", s['rows'], "value_size_bytes:", s[ 'value_size_bytes'] key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' # ec2 is about 400 secs on four m2.4xlarge nodes # should be less on more nodes? timeoutSecs = 600 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=('JStack', None)) elapsed = time.time() - start print s3nKey, 'h2o reported parse time:', parseKey['response'][ 'time'] print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] kwargs = {'cols': None, 'epsilon': 1e-6, 'k': 12} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() # pattern matching problem # h2o removes key afte parse now ### print "Removing", s3nKey ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_four_billion_rows(self): timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ "four_billion_rows.csv", ] for csvFilename in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local' timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] value_size_bytes = inspect['value_size_bytes'] row_size = inspect['row_size'] print "\n" + csvFilename, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols), \ " value_size_bytes:", "{:,}".format(value_size_bytes), \ " row_size:", "{:,}".format(row_size) expectedRowSize = num_cols * 1 # plus output expectedValueSize = expectedRowSize * num_rows self.assertEqual(row_size, expectedRowSize, msg='row_size %s is not expected num_cols * 1 byte: %s' % \ (row_size, expectedRowSize)) self.assertEqual(value_size_bytes, expectedValueSize, msg='value_size_bytes %s is not expected row_size * rows: %s' % \ (value_size_bytes, expectedValueSize)) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual(2, num_cols, msg="generated %s cols (including output). 
parsed to %s cols" % (2, num_cols)) self.assertEqual(4*1000000000, num_rows, msg="generated %s rows, parsed to %s rows" % (4*1000000000, num_rows)) # KMeans********************************* kwargs = { 'k': 3, 'initialization': 'Furthest', 'epsilon': 1e-6, 'max_iter': 20, 'cols': None, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # GLM********************************* print "\n" + csvFilename kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1} # one coefficient is checked a little more colX = 0 # L2 timeoutSecs = 900 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
def test_parse_bounds_libsvm(self): # just do the import folder once importFolderPath = "/home/0xdiag/datasets/libsvm" # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("mnist_train.svm", "cM", 30, 1), # FIX! fails KMeansScore ("tmc2007_train.svm", "cJ", 30, 1), ("covtype.binary.svm", "cC", 30, 1), ("colon-cancer.svm", "cA", 30, 1), ("connect4.svm", "cB", 30, 1), ("duke.svm", "cD", 30, 1), # too many features? 150K inspect timeout? # ("E2006.train.svm", "cE", 30, 1), ("gisette_scale.svm", "cF", 30, 1), ("mushrooms.svm", "cG", 30, 1), ("news20.svm", "cH", 30, 1), ("syn_6_1000_10.svm", "cK", 30, 1), ("syn_0_100_1000.svm", "cL", 30, 1), # normal csv ] ### csvFilenameList = random.sample(csvFilenameAll,1) # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse h2i.setupImportFolder(None, importFolderPath) csvPathname = importFolderPath + "/" + csvFilename # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=2000) print csvPathname, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) # KMEANS****************************************** for trial in range(2): kwargs = { 'k': 3, 'epsilon': 1e-6, # 'cols': 2, # 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 
265211114317615310 } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseKey['destination_key'], timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)