def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_spheres3_" + str(SEED) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = {"k": 3, "epsilon": 1e-6, "cols": None, "destination_key": "spheres3.hex", "seed": 265211114317615310} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, "d", **kwargs) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex', 'seed': 265211114317615310} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_C_kmeans_prostate(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "prostate.csv" key2 = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_B_kmeans_benign(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "benign.csv" key2 = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right? parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename expected = [ ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) , ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) , ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? 
kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def kmeans_doit(self, csvFilename, bucket, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=10 ) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun kwargs = { "k": 1, "initialization": "Furthest", "destination_key": "KMeansModel.hex", "max_iter": 25, # reuse the same seed, to get deterministic results (otherwise sometimes fails "seed": 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeans( parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs ) elapsed = time.time() - start print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, "d", **kwargs) expected = [ ( [ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154, ], num_rows, None, ) ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans["destination_key"]) KMeansModel = inspect["KMeansModel"] clusters = KMeansModel["centers"][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_B_kmeans_benign(self): h2o.beta_features = True csvPathname = "logreg" csvFilename = "benign.csv" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) expected = [ ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) , ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) , ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): params = {'k': 3, 'initialization': 'Furthest', 'ignored_cols' : None, 'destination_key': 'benign_k.hex', 'max_iter': 50, 'seed': 265211114317615310, } kwargs = params.copy() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex") h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): params = {'k': 3, 'initialization': 'Furthest', 'ignored_cols': "ID", 'destination_key': 'prostate_k.hex', 'max_iter': 100, 'seed': 265211114317615310 } kwargs = params.copy() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex") # loop, to see if we get same centers # this was sklearn.cluster.Kmeans with first col removed. num_rows and error is 0 here expected = [ ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045), ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36956), ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412), ] # all are multipliers of expected tuple value allowedDelta = (0.1, 0.1, 0.1) # try saving best! bestError = None for trial in range(10): seed = random.randint(0, sys.maxint) seed = 7509839924844349324 if h2o.beta_features: params = {'k': 3, # 'initialization': 'Furthest', 'initialization': 'PlusPlus', 'ignored_cols': "ID", 'destination_key': 'prostate_k.hex', 'max_iter': 1000, 'seed': seed } else: params = {'k': 3, # 'initialization': 'Furthest', 'initialization': 'PlusPlus', 'cols': 'CAPSULE, AGE, RACE, DPROS, DCAPS, PSA, VOL, GLEASON', 'destination_key': 'prostate_k.hex', 'max_iter': 100, 'seed': seed } kwargs = params.copy() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) error = kmeans['model']['error'] if not bestError or error < bestError: print 'Found smaller error:', error bestError = error bestCenters = centers bestSeed = seed bestTrial = trial print "bestTrial:", bestTrial print "bestError:", bestError print "bestCenters:", bestCenters print "bestSeed:", bestSeed
def test_kmeans_prostate(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955), ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045), ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(1): # kmeansSeed = random.randint(0, sys.maxint) # actually can get a slightly better error sum with a different seed # this seed gets the same result as scikit kmeansSeed = 6655548259421773879 kwargs = { "ignored_cols": "ID", "k": 3, # 'initialization': 'Furthest', "initialization": "PlusPlus", "destination_key": "prostate_k.hex", "max_iter": 500, "seed": kmeansSeed, # reuse the same seed, to get deterministic results (otherwise sometimes fails # 'seed': 265211114317615310} } # for fvec only? kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) # FIX! how do I get the kmeans result? 
### print "kmeans result:", h2o.dump_json(kmeans) # can't do this # inspect = h2o_cmd.runInspect(key='prostate_k.hex') modelView = h2o.nodes[0].kmeans_view(model="prostate_k.hex") h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView["model"] clusters = model["centers"] within_cluster_variances = model["within_cluster_variances"] total_within_SS = model["total_within_SS"] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, "d", **kwargs ) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans2_winesPCA(self): h2o.beta_features = True csvPathname = 'winesPCA.csv' start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) kwargs = { 'initialization': 'Furthest', # 'initialization': '', # 'initialization': 'PlusPlus', 'max_iter': 50, 'k': 3, 'seed': '265211114317615310', } timeoutSecs = 480 # try the same thing 5 times for trial in range (10): start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = \ h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster # now compare expected vs actual. 
By sorting on center, we should be able to compare # since the centers should be separated enough to have the order be consistent if OLD_KMEANS: expected = [ ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794), ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745), ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474), ] else: # error: 258.051462872 expected = [ ([-2.23406681758209, -0.7729819755373136], 67, 96.85372611195429), ([0.25174392601612905, 1.792222172419355], 62, 99.21823733913352), ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474), ] # multipliers on the expected values for allowed # within 2% of best with random seeds? allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def test_KMeans_twit_fvec(self): h2o.beta_features = True csvFilename = "Twitter2DB.txt" print "\nStarting", csvFilename # h2b.browseTheCloud() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=csvFilename + ".hex", schema='put') # both of these centers match what different R/Scikit packages get expected1 = [ # expected centers are from R. rest is just from h2o ([310527.2, 13433.89], 11340, None), ([5647967.1, 40487.76], 550, None), ([21765291.7, 93129.26], 14, None), ] # this is what we get with Furthest expected2 = [ ([351104.74065255735, 15421.749823633158], 11340, 5021682274541967.0), ([7292636.589090909, 7575.630909090909], 550, 6373072701775582.0), ([34406781.071428575, 244878.0], 14, 123310713697348.92), ] # all are multipliers of expected tuple value allowedDelta = (0.0001, 0.0001, 0.0001) for trial in range(2): kwargs = { 'k': 3, 'max_iter': 50, 'normalize': 0, 'initialization': 'Furthest', # 'initialization': 'PlusPlus', 'destination_key': 'kmeans_dest_key', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } init_choices = ['Furthest', 'PlusPlus'] kwargs['initialization'] = init_choices[trial % len(init_choices)] kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) # can't inspect a kmeans2 model? # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'], verbose=True) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected2, allowedDelta, trial=trial)
def test_KMeans_winesPCA(self): csvPathname = 'winesPCA.csv' start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10) print "parse end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { #appears not to take 'cols'? 'cols': None, 'initialization': 'Furthest', 'k': 3, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } timeoutSecs = 480 # try the same thing 5 times for trial in range(10): start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = \ h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster # now compare expected vs actual. By sorting on center, we should be able to compare # since the centers should be separated enough to have the order be consistent expected = [ ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794), ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745), ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474), ] # multipliers on the expected values for allowed allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def test_B_kmeans_benign(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "benign.csv" hex_key = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right? print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180, noPoll=not DO_POLL, doSummary=False) if not DO_POLL: h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) parseResult['destination_key'] = hex_key inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename expected = [ ([10.5, 2.8, 40.3, 0.0, 12.0, 0.8, 1.6, 21.1, 11.4, 0.7, 2.9, 206.2, 36.7, 1.5], 15, 0) , ([23.72897196261682, 2.3271028037383177, 44.81308411214953, 0.34579439252336447, 13.093457943925234, 1.4579439252336448, 1.3177570093457944, 24.16129367150993, 13.317757009345794, 0.5071931108136043, 2.6604011393039024, 121.6822429906542, 40.13084112149533, 1.691588785046729], 110, 0) , ([29.2625, 2.7, 48.5125, 0.1625, 12.0625, 1.0375, 1.4875, 23.023665714263917, 12.6875, 0.5073033705353737, 3.090870788693428, 160.95, 43.3, 1.65], 71, 0) , ([38.333333333333336, 2.3333333333333335, 52.666666666666664, 0.0, 14.333333333333334, 2.3333333333333335, 1.6666666666666667, 25.85955047607422, 12.0, 0.5056179761886597, 3.2846442063649497, 261.6666666666667, 43.0, 1.0], 4, 0) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01, 0.01) # loop, to see if we get same centers if DO_IGNORE: kwargs = {'k': 4, 'ignored_cols': 'STR', 'destination_key': 'benign_k.hex', 'seed': 265211114317615310, 'max_iter': 50} else: kwargs = {'k': 4, 'ignored_cols': None, 'destination_key': 'benign_k.hex', 'seed': 265211114317615310, 'max_iter': 50} kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs) if not DO_POLL: 
h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) # hack..supposed to be there like va kmeans['destination_key'] = 'benign_k.hex' ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans)) modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['clusters'] cluster_variances = model['cluster_variances'] error = model['error'] print "cluster_variances:", cluster_variances print "error:", error # make this fvec legal? (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_B_kmeans_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") expected = [ ([ 24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117 ], 77, 46889.32010560476), ([ 25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282 ], 114, 64011.20272144667), ([ 30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667 ], 12, 13000.485226507595), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_kmeans_prostate(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955), ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045), ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(1): # kmeansSeed = random.randint(0, sys.maxint) # actually can get a slightly better error sum with a different seed # this seed gets the same result as scikit kmeansSeed = 6655548259421773879 kwargs = { 'ignored_cols': 'ID', 'k': 3, # 'initialization': 'Furthest', 'initialization': 'PlusPlus', 'destination_key': 'prostate_k.hex', 'max_iter': 500, 'seed': kmeansSeed, # reuse the same seed, to get deterministic results (otherwise sometimes fails # 'seed': 265211114317615310} } # for fvec only? kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_kmeans_sphere3_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) for trial in range(10): # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = { 'k': 3, 'max_iter': 25, 'initialization': 'Furthest', 'destination_key': 'spheres3.hex', # 'seed': 265211114317615310, 'seed': 0, } timeoutSecs = 90 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) gs = h2o.nodes[0].gap_statistic(source=hex_key, k_max=5, timeoutSecs=300) print "gap_statistic:", h2o.dump_json(gs)
def test_C_kmeans_prostate(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([43.07058823529412, 0.36470588235294116, 67.70588235294117, 1.1058823529411765, 2.3529411764705883, 1.2117647058823529, 17.33529411764706, 14.201176470588232, 6.588235294117647], 103, 0) , ([166.04347826086956, 0.4658385093167702, 66.09316770186335, 1.0807453416149069, 2.3043478260869565, 1.0807453416149069, 15.0632298136646, 16.211118012422357, 6.527950310559007], 136, 0) , ([313.4029850746269, 0.35074626865671643, 64.91791044776119, 1.0820895522388059, 2.1791044776119404, 1.0746268656716418, 14.601492537313437, 16.35686567164179, 6.082089552238806], 141, 0) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) kwargs = {'k': 3, 'initialization': 'Furthest', 'destination_key': 'prostate_k.hex', 'max_iter': 50, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} # for fvec only? kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs) if not DO_POLL: h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) # hack..supposed to be there like va kmeans['destination_key'] = 'prostate_k.hex' # FIX! how do I get the kmeans result? 
### print "kmeans result:", h2o.dump_json(kmeans) # can't do this # inspect = h2o_cmd.runInspect(key='prostate_k.hex') modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['clusters'] cluster_variances = model['cluster_variances'] error = model['error'] print "cluster_variances:", cluster_variances print "error:", error # variance of 0 might be legal with duplicated rows. wasn't able to remove the duplicate rows of NAs at # bottom of benign.csv in ec2 # for i,c in enumerate(cluster_variances): # if c < 0.1: # raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i)) # make this fvec legal? (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
def test_KMeans_twit(self): csvFilename = "Twitter2DB.txt" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/' + csvFilename) # h2b.browseTheCloud() # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers # should check the means? # FIX! have to fix these to right answers expected = [ # expected centers are from R. rest is just from h2o ([310527.2, 13433.89], 11340, None), ([5647967.1, 40487.76], 550, None), ([21765291.7, 93129.26], 14, None), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'max_iter': 50, 'epsilon': 1e-4, 'normalize': 0, 'cols': '0,1', 'initialization': 'Furthest', # 'initialization': 'PlusPlus', 'destination_key': 'kmeans_dest_key', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) if 1 == 0: h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore") h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply") h2b.browseJsonHistoryAsUrlLastMatch("KMeans") time.sleep(3600) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans_twit(self): csvFilename = "Twitter2DB.txt" print "\nStarting", csvFilename # h2b.browseTheCloud() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=csvFilename + ".hex", schema='put') # both of these centers match what different R/Scikit packages get expected1 = [ # expected centers are from R. rest is just from h2o ([310527.2, 13433.89], 11340, None), ([5647967.1, 40487.76], 550, None), ([21765291.7, 93129.26], 14, None), ] # this is what we get with Furthest expected2 = [ ([351104.74065255735, 15421.749823633158], 11340, 5021682274541967.0) , ([7292636.589090909, 7575.630909090909], 550, 6373072701775582.0) , ([34406781.071428575, 244878.0], 14, 123310713697348.92) , ] # all are multipliers of expected tuple value allowedDelta = (0.0001, 0.0001, 0.0001) for trial in range(2): kwargs = { 'k': 3, 'max_iter': 50, 'normalize': 0, 'cols': '0,1', 'initialization': 'Furthest', # 'initialization': 'PlusPlus', 'destination_key': 'kmeans_dest_key', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } init_choices = ['Furthest', 'PlusPlus'] kwargs['initialization'] = init_choices[trial % len(init_choices)] kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'], verbose=True) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) if 1==0: h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore") h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply") h2b.browseJsonHistoryAsUrlLastMatch("KMeans") # Comment sleep out to get a clean grep. # time.sleep(3600) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected2, allowedDelta, trial=trial)
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str, range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [([ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154 ], num_rows, None)] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_kmeans_iris_fvec(self): h2o.beta_features = True csvFilename = 'iris.csv' csvPathname = 'iris/' + csvFilename print "\nStarting", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key) k = 3 ignored_cols = 'C5' for trial in range(3): # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = { 'ignored_cols': ignored_cols, # ignore the output 'k': k, 'max_iter': 25, 'initialization': 'Furthest', 'destination_key': 'iris.hex', 'seed': 0, } timeoutSecs = 90 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ # if ignored_cols isn't used # ([5, 3.4, 1.46, 0.244, 0.0], 50, 15.24) , # ([5.9, 2.76, 4.26, 1.33, 1.02], 51, 32.9) , # ([6.6, 2.98, 5.57, 2.03, 2.0], 49, 39.15) , ([5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999], 50, 15.240400000000003) , ([5.901612903225807, 2.748387096774194, 4.393548387096775, 1.4338709677419357], 62, 39.82096774193549) , ([6.8500000000000005, 3.073684210526315, 5.742105263157894, 2.0710526315789473], 38, 23.87947368421053) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) gs = h2o.nodes[0].gap_statistic(source=hex_key, ignored_cols=ignored_cols, k_max=k) print "gap_statistic:", h2o.dump_json(gs) k_best = gs['gap_model']['k_best'] self.assertTrue(k_best!=0, msg="k_best shouldn't be 0: %s" % k_best)
def test_C_kmeans_prostate(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "prostate.csv" key2 = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117), ([63.93984962406015], 133, 611.5187969924812), ([71.55307262569832], 179, 1474.2458100558654), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } # for fvec only? kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans_winesPCA(self): csvPathname = h2o.find_file('smalldata/winesPCA.csv') start = time.time() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { #appears not to take 'cols'? 'cols': None, 'initialization': 'Furthest', 'k': 3, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } timeoutSecs = 480 # try the same thing 5 times for trial in range (10): start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = \ h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster # now compare expected vs actual. By sorting on center, we should be able to compare # since the centers should be separated enough to have the order be consistent expected = [ ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794) , ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745) , ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474) , ] # multipliers on the expected values for allowed allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def kmeans_doit(self, csvFilename, bucket, csvPathname, numRows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=20) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun kwargs = { 'k': 1, 'initialization': 'Furthest', 'destination_key': 'KMeansModel.hex', 'max_iter': 25, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], numRows, None) ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? 
# inspect doesn't work # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key']) # KMeansModel = inspect['KMeansModel'] modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['centers'] within_cluster_variances = model['within_cluster_variances'] total_within_SS = model['total_within_SS'] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_kmeans_sphere3_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_spheres3_" + str(SEED) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=hex_key) for trial in range(10): # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = { "k": 3, "max_iter": 25, "initialization": "Furthest", "destination_key": "spheres3.hex", # 'seed': 265211114317615310, "seed": 0, } timeoutSecs = 90 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, "d", **kwargs ) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) gs = h2o.nodes[0].gap_statistic(source=hex_key, k_max=5, timeoutSecs=300) print "gap_statistic:", h2o.dump_json(gs)
def test_KMeans_twit(self): csvFilename = "Twitter2DB.txt" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/' + csvFilename) # h2b.browseTheCloud() # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers # should check the means? # FIX! have to fix these to right answers expected = [ # expected centers are from R. rest is just from h2o ([310527.2, 13433.89], 11340, None), ([5647967.1, 40487.76], 550, None), ([21765291.7, 93129.26], 14, None), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'max_iter': 50, 'epsilon': 1e-4, 'normalize': 0, 'cols': '0,1', 'initialization': 'Furthest', # 'initialization': 'PlusPlus', 'destination_key': 'kmeans_dest_key', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) if 1==0: h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore") h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply") h2b.browseJsonHistoryAsUrlLastMatch("KMeans") time.sleep(3600) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_kmeans_prostate(self): h2o.beta_features = True # fvec importFolderPath = "logreg" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\nStarting", csvFilename # loop, to see if we get same centers expected = [ ([0.37,65.77,1.07,2.23,1.11,10.49,4.24,6.31], 215, 36955), ([0.36,66.44,1.09,2.21,1.06,10.84,34.16,6.31], 136, 46045), ([0.83,66.17,1.21,2.86,1.34,73.30,15.57,7.31], 29, 33412), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(1): # kmeansSeed = random.randint(0, sys.maxint) # actually can get a slightly better error sum with a different seed # this seed gets the same result as scikit kmeansSeed = 6655548259421773879 kwargs = { 'ignored_cols': 'ID', 'k': 3, # 'initialization': 'Furthest', 'initialization': 'PlusPlus', 'destination_key': 'prostate_k.hex', 'max_iter': 500, 'seed': kmeansSeed, # reuse the same seed, to get deterministic results (otherwise sometimes fails # 'seed': 265211114317615310} } # for fvec only? kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str,range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) expected = [ ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], num_rows, None) ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def notest_B_kmeans_benign(self): h2o.beta_features = True csvPathname = "logreg" csvFilename = "benign.csv" hex_key = csvFilename + ".hex" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=hex_key) # FIX! have to fill in expected rows and error here # this is from sklearn.cluster.KMeans, with NA's converted to 0 expected = [ ([ 8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], None, None), ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], None, None), ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], None, None), ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], None, None), ] for i in range(14): execExpr = '%s[,%s] = is.na(%s[,%s]) ? 0.0 : %s[,%s]' % (hex_key,i+1,hex_key,i+1,hex_key,i+1) h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=4) # all are multipliers of expected tuple value allowedDelta = (0.1, 0.1, 0.1) # loop, to see if we get same centers for trial in range(2): params = {'k': 4, # 'initialization': 'Furthest', 'initialization': 'PlusPlus', 'destination_key': 'benign_k.hex', 'max_iter': 100, 'seed': 265211114317615310, } kwargs = params.copy() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_kmeans_sphere3_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) for trial in range(10): # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = { 'k': 3, 'max_iter': 25, 'initialization': 'Furthest', 'destination_key': 'spheres3.hex', # 'seed': 265211114317615310, 'seed': 0, } timeoutSecs = 90 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ ([100, 100, 100], 1000000, 60028168), ([200, 200, 200], 2000000, 479913618), ([300, 300, 300], 3000000, 1619244994), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='put', hex_key=csvFilename+".hex") # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117), ([63.93984962406015], 133, 611.5187969924812), ([71.55307262569832], 179, 1474.2458100558654), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers expected = [ ([55.63235294117647], 68, 667.8088235294117) , ([63.93984962406015], 133, 611.5187969924812) , ([71.55307262569832], 179, 1474.2458100558654) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_B_kmeans_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='put', hex_key=csvFilename+".hex") expected = [ ([23.10144927536232, 2.4927536231884058, 48.0, 0.21739130434782608, 12.565217391304348, 1.2028985507246377, 1.4057971014492754, 23.116674808663088, 12.826086956521738, 0.5451880801172447, 2.9851815665201102, 146.0144927536232, 42.84057971014493, 1.8985507246376812], 69, 32591.363626134153) , ([25.68421052631579, 3.0526315789473686, 46.5, 0.02631578947368421, 12.236842105263158, 1.105263157894737, 1.5789473684210527, 22.387788290952102, 12.105263157894736, 0.5934358367829686, 2.9358367829686576, 184.5, 41.026315789473685, 1.5263157894736843], 38, 21419.904448700647) , ([26.943181818181817, 2.272727272727273, 44.51136363636363, 0.38636363636363635, 12.840909090909092, 1.3636363636363635, 1.3181818181818181, 24.40187691521961, 13.477272727272727, 0.4736976506639427, 2.7090143003064355, 118.14772727272727, 40.13636363636363, 1.5568181818181819], 88, 44285.07981193549) , ([31.8, 2.4, 48.2, 0.0, 13.4, 1.8, 1.6, 24.51573033707865, 11.8, 0.3033707865168539, 2.9707865168539325, 252.0, 41.4, 1.0], 5, 2818.6716828683248) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): # 3 clusters wasn't stable? try 4 (3 wasn't stable in sklearn either) kwargs = {'k': 4, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex', 'max_iter': 50, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_B_kmeans_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") expected = [ ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) , ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) , ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='put', hex_key=csvFilename+".hex") # loop, to see if we get same centers expected = [ ([63.0, 0.384, 67.696, 1.088, 2.32, 1.168, 16.680799999999998, 15.549599999999995, 6.664], 125, 267074.35439999995) , ([188.5, 0.47619047619047616, 65.26190476190476, 1.1031746031746033, 2.3174603174603177, 1.0873015873015872, 16.17603174603175, 15.611825396825397, 6.436507936507937], 126, 257264.28207380953) , ([316.0, 0.3488372093023256, 65.1937984496124, 1.069767441860465, 2.1782945736434107, 1.069767441860465, 13.426356589147293, 16.264496124031005, 6.062015503875969], 129, 278906.3168310078) , ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'prostate_k.hex', 'max_iter': 50, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310} kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_c5_KMeans_sphere_26GB_fvec(self): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' csvFilename = 'syn_sphere15_gen_26GB.csv' # csvFilename = 'syn_sphere_gen_h1m.csv' # csvFilename = 'syn_sphere_gen_real_1.49M.csv' # csvFilename = 'syn_sphere_gen_h1m_no_na.csv' totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? if NA_COL_BUG: expected = [ # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB, # so shouldn't be used for 26GB # or it should be divided by 7 # the distribution is the same, obviously. ([ -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 25.0, 1.0, 
614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215), ([ 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379), ([ 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] else: expected = [ ([ 0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 
16716.003410920028 ], 78226988, 1151439441529.0215), ([ 0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379), ([ 0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. # PARSE **************************************** print "Parse starting: " + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse( path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs) else: parseResult = h2i.import_parse( path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, doSummary=False, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n" + l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=300) numRows = inspect['numRows'] numCols = inspect['numCols'] summary = 
h2o_cmd.runSummary(key=parseResult['destination_key'], numRows=numRows, numCols=numCols, timeoutSecs=300) h2o_cmd.infoFromSummary(summary) # KMeans **************************************** if not DO_KMEANS: continue print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?" kwargs = { 'k': 15, 'max_iter': 500, # 'normalize': 1, 'normalize': 0, # temp try 'initialization': 'Furthest', 'destination_key': 'junk.hex', # we get NaNs if whole col is NA 'ignored_cols': 'C1', 'normalize': 0, # reuse the same seed, to get deterministic results 'seed': 265211114317615310, } if (trial % 3) == 0: kwargs['initialization'] = 'PlusPlus' elif (trial % 3) == 1: kwargs['initialization'] = 'Furthest' else: kwargs['initialization'] = None timeoutSecs = 4 * 3600 params = kwargs paramsString = json.dumps(params) start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) print "kmeans result:", h2o.dump_json(kmeansResult) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial), csvFilename, elapsed, paramsString) print l h2o.cloudPerfH2O.message(l) # his does predict (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeansResult, csvPathname, parseResult, 'd', **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # these clusters were sorted compared to the cluster order in training h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial) # why is the expected # of rows not right in KMeans2. 
That means predictions are wrong h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial) # the tupleResultList has the size during predict? compare it to the sizes during training # I assume they're in the same order. model = kmeansResult['model'] size = model['size'] size2 = [t[1] for t in tupleResultList] if 1 == 1: # debug print "training size:", size print "predict size2:", size2 print "training sorted(size):", sorted(size) print "predict sorted(size2):", sorted(size2) print h2o.nodes[0].http_addr print h2o.nodes[0].port clusters = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] iterations = model["iterations"] normalized = model["normalized"] max_iter = model["max_iter"] print "iterations", iterations if iterations >= ( max_iter - 1): # h2o hits the limit at max_iter-1..shouldn't hit it raise Exception( "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s", (trial, iterations, max_iter)) # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure # can't do this compare, because size2 is sorted by center order.. # so we don't know how to reorder size the same way # we could just sort the two of them, for some bit of comparison. if sorted(size) != sorted(size2): raise Exception( "trial: %s training cluster sizes: %s not the same as predict on same data: %s" % (trial, size, size2)) # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram expectedSize = [t[1] / SCALE_SIZE for t in expected] if size2 != expectedSize: raise Exception( "trial: %s training cluster sizes: %s not the same as expected: %s" % (trial, size, expectedSize)) if DELETE_KEYS_EACH_ITER: h2i.delete_keys_at_all_nodes()
def test_KMeans_fuzzy_centers_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if DO_TWO_CLUSTER: genCenters = [ [100, 100, 100, 100, 100, 100], [200, 200, 200, 200, 200, 200], ] genCenters = [ [100, 100], [200, 200], ] else: genCenters = [ [100, 100, 100, 100, 100, 100], [110, 110, 110, 110, 110, 110], [120, 120, 120, 120, 120, 120], [130, 130, 130, 130, 130, 130], ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) rowCount = 10000 expected = [(g, rowCount, None) for g in genCenters] allowedDelta = (0.2, 0.2, 0.2, 0.2, 0.2, 0.2) allowedDelta = (0.2, 0.2) worstError = None bestError = None timeoutSecs = 60 hex_key = 'cA' print "Generate synthetic dataset with first column constant = 0 and see what KMeans does" csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname dataset = write_syn_dataset(csvPathname, rowCount, genCenters, SEED) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex") print "Parse result['destination_key']:", parseResult[ 'destination_key'] allErrors = [] for trial in range(10): seed = random.randint(0, sys.maxint) kwargs = { 'k': len(genCenters), 'initialization': 'PlusPlus', 'destination_key': 'k.hex', 'max_iter': 1000 } kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=60, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) # save the predicted h2o.nodes[0].csv_download(src_key='d', csvPathname='kmeans_predict.csv') # check center list (first center) has same number of cols as source data self.assertEqual( len(genCenters[0]), len(centers[0]), "kmeans first center doesn't have same # of values as dataset row %s %s" % (len(genCenters[0]), len(centers[0]))) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) if h2o.beta_features: error = 
kmeans['model']['total_within_SS'] within_cluster_variances = kmeans['model'][ 'within_cluster_variances'] print "trial:", trial, "within_cluster_variances:", within_cluster_variances else: model_key = kmeans["_key"] kmeansResult = h2o_cmd.runInspect(key=model_key) error = kmeansResult['KMeansModel']['error'] # compute the sum of the squares of the distance for each cluster # for each row, we # returns a tuple of numers for each center genDistances = calc_best_distance(centers, dataset) print "trial:", trial, "genDistances:", genDistances print "trial:", trial, "centers:", centers print "trial:", trial, "error:", error if (abs(genDistances - error)) > (.001 * genDistances): raise Exception( "genDistances: %s error: %s are too different" % (genDistances, error)) if not bestError or error < bestError: print 'Found smaller error:', error bestError = error bestCenters = centers bestSeed = seed bestTrial = trial if not worstError or error > worstError: print 'Found larger error:', error worstError = error allErrors.append(error) print "bestTrial:", bestTrial print "bestError:", bestError print "worstError:", worstError print "bestCenters:", bestCenters print "bestSeed:", bestSeed print "allErrors:", allErrors
def test_kmeans_iris_fvec(self): h2o.beta_features = True csvFilename = 'iris.csv' csvPathname = 'iris/' + csvFilename print "\nStarting", csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key) k = 3 ignored_cols = 'C5' for trial in range(3): # reuse the same seed, to get deterministic results (otherwise sometimes fails kwargs = { 'ignored_cols': ignored_cols, # ignore the output 'k': k, 'max_iter': 25, 'initialization': 'Furthest', 'destination_key': 'iris.hex', 'seed': 0, } timeoutSecs = 90 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [ # if ignored_cols isn't used # ([5, 3.4, 1.46, 0.244, 0.0], 50, 15.24) , # ([5.9, 2.76, 4.26, 1.33, 1.02], 51, 32.9) , # ([6.6, 2.98, 5.57, 2.03, 2.0], 49, 39.15) , ([ 5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999 ], 50, 15.240400000000003), ([ 5.901612903225807, 2.748387096774194, 4.393548387096775, 1.4338709677419357 ], 62, 39.82096774193549), ([ 6.8500000000000005, 3.073684210526315, 5.742105263157894, 2.0710526315789473 ], 38, 23.87947368421053), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial) gs = h2o.nodes[0].gap_statistic(source=hex_key, ignored_cols=ignored_cols, k_max=k) print "gap_statistic:", h2o.dump_json(gs) k_best = gs['gap_model']['k_best'] self.assertTrue(k_best != 0, msg="k_best shouldn't be 0: %s" % k_best)
def test_KMeans_covtype_fvec(self):
    # KMeans (k=6) on the full covtype dataset (home-0xdiag-datasets/standard).
    # Two trials with a fixed seed; the per-cluster (center, rows, error)
    # results are compared against hardcoded expected values (54 features per
    # center after ignoring the response column C55).
    csvFilenameList = [
        ('covtype.data', 800),
    ]
    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        for trial in range(2):
            kwargs = {
                'k': 6,
                'initialization': 'Furthest',
                # 'initialization': '',
                # 'ignored_cols': range(11, inspect['numCols']),
                # ignore the response
                'ignored_cols_by_name': 'C55',
                'max_iter': 100,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310
            }
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            # expected (center, rows-in-cluster, within-cluster error) per cluster
            expected = [
                ([2781.64184460309, 162.69950733599902, 16.545275983574268, 243.73547234768156,
                  50.48239522121315, 942.4480922085701, 208.3915356763203, 218.7135425941215,
                  140.10956243018794, 1040.6795741397266, 0.22024185323685105, 0.0845245225799837,
                  0.4957505706376572, 0.19948305354550802, 0.01635558145683929, 0.033196811983660604,
                  0.026025394050259283, 0.04566180477986607, 0.008617572941792261, 0.03547936261257615,
                  0.0, 0.0, 0.006189327591882107, 0.13606268110663236, 0.037222303163733886,
                  0.024007252359445064, 0.040891651692487006, 0.003232264365769295,
                  1.6188302332734367e-05, 0.004667627172605076, 0.00910861811255187,
                  9.173371321882807e-05, 0.0025415634662392956, 0.008946735089224526,
                  0.0023095311328034363, 0.04957397784361021, 0.09252154393235448,
                  0.03887890610245037, 0.0, 0.0, 0.0010792201555156243, 0.004867282901375466,
                  0.08281935473426902, 0.045640220376755754, 0.04933654940939677,
                  0.08426550974265995, 0.07772003949945769, 0.001327440791284218,
                  0.0014191745045030462, 0.0, 0.0, 0.009513325670870229, 0.010970272880816322,
                  0.009443176360761713], 185319, 116283720155.37769),
                ([2892.8730376693256, 119.94759695676377, 11.22516236778623, 189.0301354611245,
                  24.621525329374652, 2631.9842642419744, 219.94967526442753, 223.3794395991835,
                  135.71226572647987, 5409.1797365002785, 0.883243644460939, 0.11675635553906105,
                  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325, 0.0, 0.0,
                  0.0, 0.23410651326776769, 0.0, 0.0, 0.0, 0.026498422712933754, 0.0,
                  0.04152904063833735, 0.005158656522545927, 0.0695490814622379, 0.0,
                  0.0634997216552236, 0.05418444980515866, 0.010391538318797551,
                  0.0002969010948227871, 0.0, 0.0, 0.0, 0.3677862312117276, 0.07596956763778066,
                  0.0, 0.01109667841900167, 0.005641120801632956, 0.0, 0.0018185192057895714,
                  0.0, 0.0, 0.0021154203006123586, 0.018444980515865652,
                  0.010354425681944703], 26945, 46932273891.61873),
                ([3022.020861415003, 137.8546989122598, 13.3449108178427, 282.99227296949937,
                  45.23691263596753, 1606.0215197015768, 216.64941537882825, 222.64791856054669,
                  137.40339644525253, 2529.4366555907336, 0.4113429046111407, 0.08617284724616782,
                  0.5024842481426914, 0.0, 0.0, 0.0052506191028494405, 0.0, 0.014176671577693489,
                  0.0, 0.0, 0.0, 0.0, 0.0, 0.018949249239835743, 0.029850161436945546,
                  0.05403435628977148, 0.020892761982382997, 0.0, 0.0, 0.0018494718033917432,
                  0.011731607159650168, 0.005979436381304661, 0.0047098837027052445,
                  0.013714303626845553, 0.0007601642581737249, 0.047788470580859534,
                  0.10631328171530674, 0.04641704021817498, 0.0036519231372057308,
                  0.011872668568383437, 0.0, 0.00034481677690354536, 0.17267483777937995,
                  0.044473527475627724, 0.05637754302372967, 0.1292435973793925,
                  0.11970627880003762, 0.0013871038525438075, 0.004858781856368139, 0.0, 0.0,
                  0.03151155136202627, 0.028988119494686687,
                  0.012491771417823892], 127604, 95229063588.02844),
                ([3051.365089986695, 168.1268450579292, 14.114846831985933, 287.6101588092033,
                  50.702549817536706, 2835.266162979793, 209.89460702308608, 226.92302305495684,
                  148.84282479633362, 1461.8985753079312, 0.3284728328107128, 0.0006069141527711857,
                  0.670920253036516, 0.0, 0.0, 0.0054700083256172235, 0.0, 0.01653452018767653,
                  0.0, 0.0, 0.0, 0.0, 0.0, 0.03886584862938554, 0.013250959002170886,
                  0.04277966681969203, 0.05480901656564399, 0.0, 0.0, 0.0010426473906581905,
                  0.0018440853103432178, 0.0, 0.0035014278044491476, 0.011671426014830491,
                  0.002435437561761296, 0.044405885511091744, 0.10662236712081483,
                  0.042756323967662366, 0.0, 0.007384122192049426, 0.006263665294625696, 0.0,
                  0.14390868276285998, 0.022152366576148275, 0.07071327974851968,
                  0.14799368186805065, 0.1011367968938445, 0.009111493242244337,
                  0.006427065258833325, 0.0009259331305098857, 0.002318723301612991,
                  0.03055579330682623, 0.041044514818820564,
                  0.024074261393257027], 128519, 106432862495.53804),
                ([3052.088693852026, 149.15056174929376, 11.549996765359152, 328.4748452763461,
                  44.2420589567205, 4786.68757682272, 215.8348392383499, 226.91413106764713,
                  143.9780260065124, 4192.589071226791, 0.8949819938326181, 0.0,
                  0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0022642485929312314,
                  0.002415198499126647, 0.0, 0.00012938563388178466, 0.0, 0.1351648588618377,
                  0.0, 0.0, 0.0, 0.014836219351777974, 0.0, 0.0, 0.010674314795247235,
                  0.03553792077286352, 0.0, 0.039290104155435275, 0.09289888512712138,
                  0.03864317598602636, 0.0, 0.0, 0.0, 0.0, 0.4371509283419232,
                  0.08636491061609126, 0.0003665926293317232, 0.002717098311517478,
                  0.017100467944709204, 0.0, 0.0028249196730856323, 0.0, 0.0,
                  0.03226015138119164, 0.017316110667845514,
                  0.03204450865805533], 46373, 77991941653.19676),
                ([3119.4885286481917, 165.13178470083923, 11.672206122079334, 271.2690333876713,
                  39.407851838435064, 4959.81440560285, 212.5861709835175, 227.95909557447322,
                  148.6725381875264, 1613.4457676749382, 0.9052556903942522, 0.0,
                  0.09474430960574776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                  0.00037734709895550323, 0.0, 0.0, 0.0, 0.008346917828895732,
                  0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865, 0.0, 0.0,
                  0.02815009358208054, 0.012512829801364487, 0.0, 0.13355068526233171,
                  0.11424560767976816, 0.008799734347642335, 0.0, 0.0018867354947775161,
                  0.0012226046006158305, 0.0, 0.44056028497252914, 0.10774014369377528,
                  0.0033810300066413087, 0.014580691903640641, 0.02313892410795146,
                  0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0,
                  0.06503954597597053, 0.022625732053371973,
                  0.008256354525146411], 66252, 74666940350.2879),
            ]

            ### print h2o.dump_json(kmeans)
            predictKey = 'd'
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult,
                csvPathname, parseResult, predictKey, **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
                allowError=False, allowRowError=True, trial=trial)
            print "Trial #", trial, "completed\n"
def test_kmeans_prostate(self):
    # KMeans (k=3, fixed seed, 'ID' ignored) on prostate.csv via the h2o-dev
    # REST schema (build_model/models/compute_model_metrics/predict); the
    # sorted cluster tuples must match the expected centers within 2%.
    importFolderPath = "logreg"
    csvFilename = "prostate.csv"
    hex_key = "prostate.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    # loop, to see if we get same centers
    # (id, center, rows-in-cluster, error) per cluster; id is unknown -> None
    expected = [
        (None, [0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955),
        (None, [0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045),
        (None, [0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.02, 0.02, 0.02)

    # the ID column is ignored for training
    labelListUsed = list(labelList)
    labelListUsed.remove('ID')
    numColsUsed = numCols - 1

    for trial in range(5):
        # kmeansSeed = random.randint(0, sys.maxint)
        # actually can get a slightly better error sum with a different seed
        # this seed gets the same result as scikit (at least in h2o1)
        # kmeansSeed = 6655548259421773879
        kmeansSeed = 7037878434240420762
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': "['ID']",
            'k': 3,
            'max_iterations': 500,
            'standardize': False,
            'seed': kmeansSeed,
            # PlusPlus init seems bad here..should investigate
            'init': 'Furthest',
        }
        model_key = 'prostate_k.hex'
        bmResult = h2o.n0.build_model(algo='kmeans', destination_key=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)
        h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')
        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
        h2o_cmd.runStoreView()
def notest_kmeans_benign(self):
    # (disabled via the "notest_" prefix) KMeans k=4 on benign.csv using the
    # h2o-dev REST schema with a fresh random seed per trial; the sorted
    # cluster tuples must land near the expected centers within 1%.
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    # (id, center, rows-in-cluster, error); id and error unknown -> None
    expected = [
        (None, [8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53,
                2.12, 128.61, 35.33, 1.57], 49, None),
        (None, [33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37,
                2.52, 125.40, 43.91, 1.79], 87, None),
        (None, [27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58,
                2.89, 171.27, 42.73, 1.53], 55, None),
        (None, [26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22,
                2.89, 234.56, 39.22, 1.56], 9, None),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01, 0.01)

    # loop, to see if we get same centers
    # no cols ignored
    labelListUsed = list(labelList)
    numColsUsed = numCols

    for trial in range(5):
        kmeansSeed = random.randint(0, sys.maxint)
        # kmeansSeed = 6655548259421773879
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            'k': 4,
            'max_iterations': 50,
            'standardize': False,
            'seed': kmeansSeed,
            'init': 'Furthest',
        }
        model_key = 'benign_k.hex'
        kmeansResult = h2o.n0.build_model(algo='kmeans', destination_key=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=10)

        modelResult = h2o.n0.models(key=model_key)
        km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)

        # zip with * is it's own inverse here. It's sorted by centers for easy comparisons
        # changed..old order: ids, mses, rows, centers = zip(*km.tuplesSorted)
        # new order:
        # ids, centers, rows, errors = zip(*km.tuplesSorted)
        # old. this was going to do a predict and a summary (histogram) (old h2o1 needed this for more info)
        # (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname, parseResult, 'd', parameters)
        h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta)

        # Not seeing any scoring results yet?
        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')
        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
        h2o_cmd.runStoreView()
def test_c5_KMeans_sphere_67MB_fvec(self):
    # Benchmark test: parse a 67MB 15-sphere synthetic dataset (HDFS or local
    # benchmark dir), then run KMeans k=15 for 6 trials, cycling the
    # initialization mode per trial, and compare recovered centers against the
    # known generated spheres (allowError=True: error mismatch only warns).
    h2o.beta_features = True  # fvec
    # a kludge
    h2o.setup_benchmark_log()

    csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
    totalBytes = 67306997
    if FROM_HDFS:
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    # (center, rows-in-cluster, within-cluster error); leading 0.0 is the
    # constant first column
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    # successive overrides from earlier tuning sessions; only the last applies
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        fileMBS = (totalBytes / 1e6) / elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # clear out all NAs (walk across cols)..clear to 0
        # temp
        ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
        ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
        h2o_cmd.infoFromSummary(summary)

        # KMeans ****************************************
        if not DO_KMEANS:
            continue

        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'max_iter': 10,
            'normalize': 1,
            'initialization': 'Furthest',
            'destination_key': 'junk.hex',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
            # 'ignored_cols': 'C0',
            # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
        }
        # cycle the init method across trials
        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "kmeans result:", h2o.dump_json(kmeans)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB,
            "KMeans", "trial " + str(trial), csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname,
            parseResult, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta,
            allowError=True, trial=trial)
        h2i.delete_keys_at_all_nodes()
def kmeans_doit(self, csvFilename, bucket, csvPathname, numRows, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=20) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun kwargs = { 'k': 1, 'initialization': 'Furthest', 'destination_key': 'KMeansModel.hex', 'max_iter': 25, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) expected = [([ -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154 ], numRows, None)] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0) # compare this kmeans to the first one. since the files are replications, the results # should be similar? 
# inspect doesn't work # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key']) # KMeansModel = inspect['KMeansModel'] modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex') h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView)) model = modelView['model'] clusters = model['centers'] within_cluster_variances = model['within_cluster_variances'] total_within_SS = model['total_within_SS'] print "within_cluster_variances:", within_cluster_variances print "total_within_SS:", total_within_SS if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_kmeans_sphere100(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) if SHUFFLE_SPHERES: # since we create spheres in order csvFilename2 = 'syn_spheres100_shuffled.csv' csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2 h2o_util.file_shuffle(csvPathname, csvPathname2) else: csvFilename2 = csvFilename csvPathname2 = csvPathname print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex") pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList numColsUsed = numCols labelListUsed = labelList ### h2b.browseTheCloud() # try 5 times, to see if all inits by h2o are good # does it break if cols is not specified? destination_key = 'syn_spheres100.hex' cols = ",".join(map(str,range(DIMENSIONS))) for trial in range(2): parameters = { 'validation_frame': parse_key, 'ignored_columns': None, 'score_each_iteration': False, 'k': CLUSTERS, 'max_iterations': 50, 'standardize': False, # 'seed': kmeansSeed, 'init': 'Furthest', } timeoutSecs = 100 model_key = 'sphere100_k.hex' kmeansResult = h2o.n0.build_model( algo='kmeans', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=timeoutSecs) modelResult = h2o.n0.models(key=model_key) km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed) # no expected row/error? expected = [(None, c, None, None) for c in centersList] expected.sort(key=lambda tup: sum(tup[1])) h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta=[.01, .01, .01]) print "Trial #", trial, "completed"
def test_B_kmeans_benign(self): importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = "benign.csv" key2 = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename h2i.setupImportFolder(None, importFolderPath) # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right? parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\nStarting", csvFilename expected = [ ([ 24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117 ], 77, 46889.32010560476), ([ 25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282 ], 114, 64011.20272144667), ([ 30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667 ], 12, 13000.485226507595), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # loop, to see if we get same centers for trial in range(2): kwargs = { 'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } # for fvec only? 
kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5}) kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_KMeans_sphere15_180GB(self): csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "/datasets/kmeans_big" csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([ 0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215), ([ 0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 
11767.998552236539 ], 148426180, 35942838893.32379), ([ 0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. if FROM_HDFS: importFolderResult = h2i.setupImportHdfs( None, importFolderPath) else: importFolderResult = h2i.setupImportFolder( None, importFolderPath) # PARSE **************************************** print "Parse starting: " + csvFilename key2 = csvFilename + "_" + str(trial) + ".hex" start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseKey = h2i.parseImportHdfsFile( None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) else: parseKey = h2i.parseImportFolderFile( None, csvFilename, importFolderPath, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n" + l h2o.cloudPerfH2O.message(l) # KMeans **************************************** print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?" 
kwargs = { 'k': 15, 'initialization': 'Furthest', 'epsilon': 1e-6, 'cols': None, 'destination_key': 'junk.hex', # reuse the same seed, to get deterministic results 'seed': 265211114317615310, } if (trial % 3) == 0: kwargs['initialization'] = 'PlusPlus' elif (trial % 3) == 1: kwargs['initialization'] = 'Furthest' else: kwargs['initialization'] = None timeoutSecs = 4 * 3600 params = kwargs paramsString = json.dumps(params) start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial), csvFilename, elapsed, paramsString) print l h2o.cloudPerfH2O.message(l) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)