def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    bucket = 'datasets'
    csvPathname = 'logreg' + '/' + csvFilename
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    glm_doit(self, filename4x, None, pathname4x, timeoutSecs=120)
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    kmeans_doit(self, csvFilename, csvPathname, num_rows=1000000, timeoutSecs=60)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    kmeans_doit(self, filename2xShuf, pathname2xShuf, num_rows=2000000, timeoutSecs=90)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    kmeans_doit(self, filename4x, pathname4x, num_rows=4000000, timeoutSecs=120)
def test_KMeans_hastie_shuffle_fvec(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard/' + csvFilename
    bucket = 'home-0xdiag-datasets'
    kmeans_doit(self, csvFilename, bucket, csvPathname, numRows=1000000, timeoutSecs=60)
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    kmeans_doit(self, filename2xShuf, None, pathname2xShuf, numRows=2000000, timeoutSecs=90)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    kmeans_doit(self, filename4x, None, pathname4x, numRows=4000000, timeoutSecs=120)
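# The three replication tests above only rely on three file helpers from
# h2o_util, with the signatures used there: file_gunzip(src, dst),
# file_cat(src1, src2, dst), and file_shuffle(src, dst). A minimal sketch of
# what such helpers plausibly do (an illustration, NOT the actual h2o_util
# implementation; assumes the shuffled file fits in memory):
import gzip
import random
import shutil

def file_gunzip(gzPath, outPath):
    # decompress gzPath into outPath
    with gzip.open(gzPath, 'rb') as fin, open(outPath, 'wb') as fout:
        shutil.copyfileobj(fin, fout)

def file_cat(path1, path2, outPath):
    # concatenate two files; passing the same path twice doubles the rows
    with open(outPath, 'wb') as fout:
        for p in (path1, path2):
            with open(p, 'rb') as fin:
                shutil.copyfileobj(fin, fout)

def file_shuffle(inPath, outPath):
    # write the input rows back out in a random order
    with open(inPath) as fin:
        lines = fin.readlines()
    random.shuffle(lines)
    with open(outPath, 'w') as fout:
        fout.writelines(lines)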
def test_kmeans_sphere100(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres100.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

    if SHUFFLE_SPHERES:
        # since we create spheres in order
        csvFilename2 = 'syn_spheres100_shuffled.csv'
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
        h2o_util.file_shuffle(csvPathname, csvPathname2)
    else:
        csvFilename2 = csvFilename
        csvPathname2 = csvPathname

    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
    ### h2b.browseTheCloud()

    # try 10 times, to see if all inits by h2o are good
    # does it break if cols is not specified?
    cols = ",".join(map(str, range(DIMENSIONS)))
    for trial in range(10):
        kwargs = {
            'k': CLUSTERS,
            'initialization': 'Furthest',
            'cols': cols,
            'destination_key': 'syn_spheres100.hex',
        }
        timeoutSecs = 100
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
        # print h2o.dump_json(kmeansResult)
        ### print h2o.dump_json(kmeans)
        ### print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        # cluster centers can return in any order
        clusters = kmeansResult['KMeansModel']['clusters']

        # the way we create the centers above, if we sort on the sum of xyz
        # we should get the order the same as when they were created.
        # to be safe, we'll sort the centers that were generated too, the same way
        clustersSorted = sorted(clusters, key=sum)
        centersSorted = sorted(centersList, key=sum)
        ### print clustersSorted

        print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
        cf = '{0:6.2f}'
        for c in clustersSorted:
            print ' '.join(map(cf.format, c))
        print "\ngenerated centers (sorted by key=sum)"
        for c in centersSorted:
            print ' '.join(map(cf.format, c))

        for i, center in enumerate(centersSorted):
            # Comparing gen'ed/actual centers is kind of a hamming distance problem.
            # Assuming the difference between adjacent sums of all center values
            # is greater than 2x the sum of the max allowed variance on each value,
            # the sums will be unique and non-overlapping within the allowed variance.
            # So a sort of the centers, keyed on the sum of all values for a center,
            # creates an ordering that can be compared.
            # Sort gen'ed and actual separately.
            # The adjacent-center hamming distance check is done during gen above.
            a = center
            b = clustersSorted[i]
            print "\nexpected:", a
            print "h2o:", b  # h2o result
            aStr = ",".join(map(str, a))
            bStr = ",".join(map(str, b))
            iStr = str(i)
            for j, v in enumerate(a):
                emsg = aStr + " != " + bStr + ". Sorted cluster center " + iStr + " axis " + str(j) + " not correct."
                self.assertAlmostEqual(a[j], b[j], delta=ALLOWED_CENTER_DELTA, msg=emsg)

        print "Trial #", trial, "completed"
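# Standalone illustration of the sum-sort matching trick used in the test
# above (made-up numbers, not from a real run). Because h2o can return the
# cluster centers in any order, both lists are sorted by sum(); as long as
# adjacent center sums differ by more than twice the total allowed per-axis
# wobble, this pairs each found center with the center that generated it.
generated = [[10, 10, 10], [40, 40, 40], [70, 70, 70]]
found = [[69.9, 70.2, 70.0], [9.8, 10.1, 10.0], [40.1, 39.9, 40.0]]  # arbitrary order

genSorted = sorted(generated, key=sum)
foundSorted = sorted(found, key=sum)
for g, f in zip(genSorted, foundSorted):
    for axis in range(len(g)):
        assert abs(g[axis] - f[axis]) < 1.0, (g, f, axis)
print "all centers matched within tolerance"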
def test_kmeans_sphere100(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres100.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

    if SHUFFLE_SPHERES:
        # since we create spheres in order
        csvFilename2 = 'syn_spheres100_shuffled.csv'
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
        h2o_util.file_shuffle(csvPathname, csvPathname2)
    else:
        csvFilename2 = csvFilename
        csvPathname2 = csvPathname

    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    numColsUsed = numCols
    labelListUsed = labelList
    ### h2b.browseTheCloud()

    # try 2 times, to see if all inits by h2o are good
    # does it break if cols is not specified?
    destination_key = 'syn_spheres100.hex'
    cols = ",".join(map(str, range(DIMENSIONS)))
    for trial in range(2):
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            'k': CLUSTERS,
            'max_iterations': 50,
            'standardize': False,
            # 'seed': kmeansSeed,
            'init': 'Furthest',  # [u'Random', u'PlusPlus', u'Furthest', u'User']
            # 'dropNA20Cols': False,
            # 'user_points': userPointsKey
        }
        timeoutSecs = 100
        model_key = 'sphere100_k.hex'
        kmeansResult = h2o.n0.build_model(
            algo='kmeans',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=timeoutSecs)

        modelResult = h2o.n0.models(key=model_key)
        km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)

        # no expected row/error?
        expected = [(None, c, None, None) for c in centersList]
        expected.sort(key=lambda tup: sum(tup[1]))
        h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta=[.01, .01, .01])

        print "Trial #", trial, "completed"
        for n in range(numPts):
            interestingEnum = getInterestingEnum()
            thisPt = currentCenter[:]
            xyz = get_xyz_sphere(R)
            for i in range(3):
                thisPt[xyzShift + i] += int(xyz[i])
            dsf.write(",".join(map(str, [interestingEnum] + thisPt)) + "\n")
            totalRows += 1

        sphereCnt += 1
        # end of while loop

    dsf.close()
    print "Spheres created:", len(centersList), "totalRows:", totalRows
    return centersList

#*****************************************************
csvFilename = 'syn_sphere_gen.csv'
csvPathname = './' + csvFilename
centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

if SHUFFLE_SPHERES:
    # since we create spheres in order
    csvFilename2 = 'syn_sphere_gen_shuffled.csv'
    csvPathname2 = './' + csvFilename2
    import h2o_util
    h2o_util.file_shuffle(csvPathname, csvPathname2)
else:
    csvFilename2 = csvFilename
    csvPathname2 = csvPathname
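# get_xyz_sphere(R), used above, isn't shown in this fragment. A common way to
# implement it (an assumption about intent, not necessarily what the harness
# does) is to pick a uniform direction by normalizing a 3d gaussian sample,
# then scale the radius so points land uniformly inside the sphere's volume:
import math
import random

def get_xyz_sphere(R):
    # direction: a normalized gaussian vector is uniform over directions
    x, y, z = [random.gauss(0, 1) for _ in range(3)]
    norm = math.sqrt(x * x + y * y + z * z)
    # radius: the cube-root transform keeps points from clustering at the center
    r = R * (random.random() ** (1.0 / 3.0))
    return [r * x / norm, r * y / norm, r * z / norm]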
def test_kmeans_sphere100(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres100.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

    if SHUFFLE_SPHERES:
        # since we create spheres in order
        csvFilename2 = 'syn_spheres100_shuffled.csv'
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
        h2o_util.file_shuffle(csvPathname, csvPathname2)
    else:
        csvFilename2 = csvFilename
        csvPathname2 = csvPathname

    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    numColsUsed = numCols
    labelListUsed = labelList
    ### h2b.browseTheCloud()

    # try 2 times, to see if all inits by h2o are good
    # does it break if cols is not specified?
    destination_key = 'syn_spheres100.hex'
    cols = ",".join(map(str, range(DIMENSIONS)))
    for trial in range(2):
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            'score_each_iteration': False,
            'k': CLUSTERS,
            'max_iters': 50,
            'standardize': False,
            # 'seed': kmeansSeed,
            'init': 'Furthest',
        }
        timeoutSecs = 100
        model_key = 'sphere100_k.hex'
        kmeansResult = h2o.n0.build_model(
            algo='kmeans',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=timeoutSecs)

        start = time.time()
        modelResult = h2o.n0.models(key=model_key)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        # this prints too
        km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)
        h2o_cmd.runStoreView()

        # zip with * is its own inverse here. It's sorted by centers for easy comparisons
        ids, mses, rows, clusters = zip(*km.tuplesSorted)

        # the way we create the centers above, if we sort on the sum of xyz
        # we should get the order the same as when they were created.
        # to be safe, we'll sort the centers that were generated too, the same way
        clustersSorted = sorted(clusters, key=sum)
        centersSorted = sorted(centersList, key=sum)
        kmeansResult = modelResult
        ### print clustersSorted

        print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
        cf = '{0:6.2f}'
        for c in clustersSorted:
            print ' '.join(map(cf.format, c))
        print "\ngenerated centers (sorted by key=sum)"
        for c in centersSorted:
            print ' '.join(map(cf.format, c))

        for i, center in enumerate(centersSorted):
            # Comparing gen'ed/actual centers is kind of a hamming distance problem.
            # Assuming the difference between adjacent sums of all center values
            # is greater than 2x the sum of the max allowed variance on each value,
            # the sums will be unique and non-overlapping within the allowed variance.
            # So a sort of the centers, keyed on the sum of all values for a center,
            # creates an ordering that can be compared.
            # Sort gen'ed and actual separately.
            # The adjacent-center hamming distance check is done during gen above.
            a = center
            b = clustersSorted[i]
            print "\nexpected:", a
            print "h2o:", b  # h2o result
            aStr = ",".join(map(str, a))
            bStr = ",".join(map(str, b))
            iStr = str(i)
            for j, v in enumerate(a):
                emsg = aStr + " != " + bStr + ". Sorted cluster center " + iStr + " axis " + str(j) + " not correct."
                self.assertAlmostEqual(a[j], b[j], delta=ALLOWED_CENTER_DELTA, msg=emsg)

        print "Trial #", trial, "completed"
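# Quick demo of the zip(*...) "unzip" idiom used above, with made-up tuples
# shaped like km.tuplesSorted, i.e. (id, mse, rows, center):
tuples = [(0, 0.5, 100, [1, 1, 1]), (1, 0.7, 120, [4, 4, 4])]
ids, mses, rows, clusters = zip(*tuples)
print ids       # (0, 1)
print clusters  # ([1, 1, 1], [4, 4, 4])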
def test_kmeans2_sphere100(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres100.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

    if SHUFFLE_SPHERES:
        # since we create spheres in order
        csvFilename2 = 'syn_spheres100_shuffled.csv'
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
        h2o_util.file_shuffle(csvPathname, csvPathname2)
    else:
        csvFilename2 = csvFilename
        csvPathname2 = csvPathname

    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
    ### h2b.browseTheCloud()

    # try 10 times, to see if all inits by h2o are good
    # does it break if cols is not specified?
    cols = ",".join(map(str, range(DIMENSIONS)))
    for trial in range(10):
        kwargs = {
            'k': CLUSTERS,
            'initialization': 'Furthest',
            'destination_key': 'syn_spheres100.hex',
            'max_iter': 15,
        }
        timeoutSecs = 100
        start = time.time()
        kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        # can't inspect a kmeans2 model
        # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
        # print h2o.dump_json(kmeansResult)
        ### print h2o.dump_json(kmeans)
        ### print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

        # cluster centers can return in any order
        model = kmeansResult['model']
        clusters = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        iterations = model["iterations"]
        normalized = model["normalized"]
        max_iter = model["max_iter"]

        # the way we create the centers above, if we sort on the sum of xyz
        # we should get the order the same as when they were created.
        # to be safe, we'll sort the centers that were generated too, the same way
        clustersSorted = sorted(clusters, key=sum)
        centersSorted = sorted(centersList, key=sum)
        ### print clustersSorted

        print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
        cf = '{0:6.2f}'
        for c in clustersSorted:
            print ' '.join(map(cf.format, c))
        print "\ngenerated centers (sorted by key=sum)"
        for c in centersSorted:
            print ' '.join(map(cf.format, c))

        for i, center in enumerate(centersSorted):
            # Comparing gen'ed/actual centers is kind of a hamming distance problem.
            # Assuming the difference between adjacent sums of all center values
            # is greater than 2x the sum of the max allowed variance on each value,
            # the sums will be unique and non-overlapping within the allowed variance.
            # So a sort of the centers, keyed on the sum of all values for a center,
            # creates an ordering that can be compared.
            # Sort gen'ed and actual separately.
            # The adjacent-center hamming distance check is done during gen above.
            a = center
            b = clustersSorted[i]
            print "\nexpected:", a
            print "h2o:", b  # h2o result
            aStr = ",".join(map(str, a))
            bStr = ",".join(map(str, b))
            iStr = str(i)
            for j, v in enumerate(a):
                emsg = aStr + " != " + bStr + ". Sorted cluster center " + iStr + " axis " + str(j) + " not correct."
                self.assertAlmostEqual(a[j], b[j], delta=ALLOWED_CENTER_DELTA, msg=emsg)

        print "Trial #", trial, "completed"