def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        bucket = 'datasets'
        csvPathname = 'logreg' + '/' + csvFilename
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        glm_doit(self, filename4x, None, pathname4x, timeoutSecs=120)
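#*****************************************************
# A minimal sketch of what the h2o_util file helpers used above might look
# like, assuming plain line-oriented text files. These are hypothetical
# illustrations, not the actual h2o_util implementations.
import gzip
import random
import shutil

def file_gunzip(srcPath, dstPath):
    # decompress a .gz file into a plain text file
    with gzip.open(srcPath, 'rb') as src, open(dstPath, 'wb') as dst:
        shutil.copyfileobj(src, dst)

def file_cat(srcPath1, srcPath2, dstPath):
    # concatenate two files to build the 2x/4x replications
    with open(dstPath, 'wb') as dst:
        for srcPath in (srcPath1, srcPath2):
            with open(srcPath, 'rb') as src:
                shutil.copyfileobj(src, dst)

def file_shuffle(srcPath, dstPath):
    # shuffle rows so the tests can check that row order doesn't matter
    with open(srcPath) as src:
        lines = src.readlines()
    random.shuffle(lines)
    with open(dstPath, 'w') as dst:
        dst.writelines(lines)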
    def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        kmeans_doit(self, csvFilename, csvPathname, num_rows=1000000, timeoutSecs=60)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        kmeans_doit(self, filename2xShuf, pathname2xShuf, num_rows=2000000, timeoutSecs=90)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        kmeans_doit(self, filename4x, pathname4x, num_rows=4000000, timeoutSecs=120)
    def test_KMeans_hastie_shuffle_fvec(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard/' + csvFilename
        bucket = 'home-0xdiag-datasets'
        kmeans_doit(self, csvFilename, bucket, csvPathname, numRows=1000000, timeoutSecs=60)
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        kmeans_doit(self, filename2xShuf, None, pathname2xShuf, numRows=2000000, timeoutSecs=90)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        kmeans_doit(self, filename4x, None, pathname4x, numRows=4000000, timeoutSecs=120)
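#*****************************************************
# The FIX! comments above note that the 1x/2x/4x results should eventually be
# compared. A hypothetical sketch of that check, assuming kmeans_doit were
# changed to return the cluster centers (sorted by sum) from each run:
def check_replication_invariance(centers1x, centers2x, centers4x, delta=0.01):
    # replicating every row 2x/4x shouldn't move the cluster centers
    for centersNx in (centers2x, centers4x):
        for cA, cB in zip(centers1x, centersNx):
            for a, b in zip(cA, cB):
                assert abs(a - b) <= delta, "center moved: %s vs %s" % (cA, cB)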
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 10 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'initialization': 'Furthest', 
                'cols': cols,
                'destination_key': 'syn_spheres100.hex'
            }
            timeoutSecs = 100
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']

            # Given the way we create the centers above, sorting on the sum of xyz
            # should reproduce the creation order. To be safe, sort the generated
            # centers the same way too.
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i, center in enumerate(centersSorted):
                # Comparing gen'ed vs. actual centers is a bit like a hamming distance problem.
                # Assuming the difference between adjacent sums of all center values
                # is greater than 2x the total allowed variance per value, the sums
                # are unique and non-overlapping within the allowed variance. So
                # sorting the centers, keyed on the sum of all values for a center,
                # creates an ordering that can be compared. Gen'ed and actual are
                # sorted separately; the adjacent-center distance check is done
                # during generation above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)

                # j indexes the axis; don't shadow the center index i
                for j, v in enumerate(a):
                    emsg = aStr + " != " + bStr + ". Sorted cluster center " + iStr + " axis " + str(j) + " not correct."
                    self.assertAlmostEqual(v, b[j], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        numColsUsed = numCols
        labelListUsed = labelList

        ### h2b.browseTheCloud()

        # try 2 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        destination_key = 'syn_spheres100.hex'  # note: not used below; model_key is used instead
        cols = ",".join(map(str, range(DIMENSIONS)))  # note: not passed to the model below
        for trial in range(2):
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                'k': CLUSTERS,
                'max_iterations': 50,
                'standardize': False,
                # 'seed': kmeansSeed,
                'init': 'Furthest', # [u'Random', u'PlusPlus', u'Furthest', u'User']
                # 'dropNA20Cols': False,
                # 'user_points': userPointsKey
            }

            timeoutSecs = 100
            model_key = 'sphere100_k.hex'
            kmeansResult = h2o.n0.build_model(
                algo='kmeans',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=timeoutSecs)

            modelResult = h2o.n0.models(key=model_key)
            km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)

            # no expected row/error?
            expected = [(None, c, None, None) for c in centersList] 
            expected.sort(key=lambda tup: sum(tup[1]))
            h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta=[.01, .01, .01])

            print "Trial #", trial, "completed"
Example #7
        for n in range(numPts):
            interestingEnum = getInterestingEnum()
            thisPt = currentCenter[:]
            xyz = get_xyz_sphere(R)
            for i in range(3):
                thisPt[xyzShift + i] += int(xyz[i])
            dsf.write(",".join(map(str, [interestingEnum] + thisPt)) + "\n")
            totalRows += 1

        sphereCnt += 1  # end of while loop

    dsf.close()
    print "Spheres created:", len(centersList), "totalRows:", totalRows
    return centersList


#*****************************************************
csvFilename = 'syn_sphere_gen.csv'
csvPathname = './' + csvFilename
centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

if SHUFFLE_SPHERES:
    # since we create spheres in order
    csvFilename2 = 'syn_sphere_gen_shuffled.csv'
    csvPathname2 = './' + csvFilename2
    import h2o_util
    h2o_util.file_shuffle(csvPathname, csvPathname2)
else:
    csvFilename2 = csvFilename
    csvPathname2 = csvPathname
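#*****************************************************
# get_xyz_sphere isn't shown in this fragment. A hypothetical stand-in that
# draws a uniform random point inside a sphere of radius R (the real helper
# may differ): direction from a normalized Gaussian vector, radius scaled by
# the cube root of a uniform draw so the volume is covered uniformly.
import math
import random

def get_xyz_sphere(R):
    x, y, z = [random.gauss(0, 1) for _ in range(3)]
    norm = math.sqrt(x * x + y * y + z * z) or 1.0
    r = R * random.random() ** (1.0 / 3.0)
    return [r * x / norm, r * y / norm, r * z / norm]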
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        numColsUsed = numCols
        labelListUsed = labelList

        ### h2b.browseTheCloud()

        # try 2 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        destination_key = 'syn_spheres100.hex'  # note: not used below; model_key is used instead
        cols = ",".join(map(str, range(DIMENSIONS)))  # note: not passed to the model below
        for trial in range(2):
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                'score_each_iteration': False,
                'k': CLUSTERS,
                'max_iters': 50,
                'standardize': False,
                # 'seed': kmeansSeed,
                'init': 'Furthest',
            }

            timeoutSecs = 100
            model_key = 'sphere100_k.hex'
            kmeansResult = h2o.n0.build_model(
                algo='kmeans',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=timeoutSecs)

            start = time.time()
            modelResult = h2o.n0.models(key=model_key)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # this prints too
            km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)
            h2o_cmd.runStoreView()

            # zip with * is its own inverse here. km.tuplesSorted is sorted by centers for easy comparison.
            ids, mses, rows, clusters = zip(*km.tuplesSorted)


            # Given the way we create the centers above, sorting on the sum of xyz
            # should reproduce the creation order. To be safe, sort the generated
            # centers the same way too.
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)

            kmeansResult = modelResult
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i, center in enumerate(centersSorted):
                # Comparing gen'ed vs. actual centers is a bit like a hamming distance problem.
                # Assuming the difference between adjacent sums of all center values
                # is greater than 2x the total allowed variance per value, the sums
                # are unique and non-overlapping within the allowed variance. So
                # sorting the centers, keyed on the sum of all values for a center,
                # creates an ordering that can be compared. Gen'ed and actual are
                # sorted separately; the adjacent-center distance check is done
                # during generation above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)

                # j indexes the axis; don't shadow the center index i
                for j, v in enumerate(a):
                    emsg = aStr + " != " + bStr + ". Sorted cluster center " + iStr + " axis " + str(j) + " not correct."
                    self.assertAlmostEqual(v, b[j], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
    def test_kmeans2_sphere100(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 10 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'initialization': 'Furthest', 
                'destination_key': 'syn_spheres100.hex',
                'max_iter': 15,
            }
            timeoutSecs = 100
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # can't inspect a kmeans2 model
            # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            # cluster centers can return in any order
            model = kmeansResult['model']
            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]


            # Given the way we create the centers above, sorting on the sum of xyz
            # should reproduce the creation order. To be safe, sort the generated
            # centers the same way too.
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i, center in enumerate(centersSorted):
                # Comparing gen'ed vs. actual centers is a bit like a hamming distance problem.
                # Assuming the difference between adjacent sums of all center values
                # is greater than 2x the total allowed variance per value, the sums
                # are unique and non-overlapping within the allowed variance. So
                # sorting the centers, keyed on the sum of all values for a center,
                # creates an ordering that can be compared. Gen'ed and actual are
                # sorted separately; the adjacent-center distance check is done
                # during generation above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)

                # j indexes the axis; don't shadow the center index i
                for j, v in enumerate(a):
                    emsg = aStr + " != " + bStr + ". Sorted cluster center " + iStr + " axis " + str(j) + " not correct."
                    self.assertAlmostEqual(v, b[j], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"