Example #1
0
    def test_kmeans_benign(self):
        """Parse smalldata/logreg/benign.csv and run KMeans (k=4, PlusPlus init)
        with a fresh random seed, then pull centers via bigCheckResults.

        NOTE(review): `expected` and `allowedDelta` are built below but no
        compareResultsToExpected call follows -- presumably because the random
        seed makes exact center matching unreliable; confirm whether a
        comparison step was intended here.
        """
        # use the fvec (beta) code paths in the h2o wrappers
        h2o.beta_features = True
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # each tuple: (center coordinates, rows per cluster, sqr error per cluster);
        # None means the error slot is not checked
        expected = [
            ([
                8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53,
                2.12, 128.61, 35.33, 1.57
            ], 49, None),
            ([
                33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30,
                0.37, 2.52, 125.40, 43.91, 1.79
            ], 87, None),
            ([
                27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53,
                0.58, 2.89, 171.27, 42.73, 1.53
            ], 55, None),
            ([
                26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44,
                0.22, 2.89, 234.56, 39.22, 1.56
            ], 9, None),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # loop, to see if we get same centers

        for trial in range(1):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879

            kwargs = {
                'k': 4,
                'initialization': 'PlusPlus',
                'destination_key': 'benign_k.hex',
                # 'seed': 265211114317615310,
                'max_iter': 50,
                'seed': kmeansSeed,
            }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=5,
                                       **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def kmeans_doit(self, csvFilename, bucket, csvPathname, num_rows, timeoutSecs=30):
    """KMeans helper shared by tests (invoked with a TestCase instance as `self`).

    Parses the csv, runs KMeans with k=1 and a fixed seed (deterministic),
    checks the resulting single center against hard-coded expected values, and
    compares clusters across calls: the first call stores its clusters in
    self.clusters1, subsequent calls are compared against that baseline.
    """
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(
        bucket=bucket, path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=10
    )
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        "k": 1,
        "initialization": "Furthest",
        "destination_key": "KMeansModel.hex",
        "max_iter": 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        "seed": 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(
        parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs
    )
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
        (elapsed / timeoutSecs) * 100
    )

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, "d", **kwargs)

    # single expected tuple: (center coordinates, rows in cluster, error);
    # error slot is None so it's not checked
    expected = [
        (
            [
                -0.0006628900000000158,
                -0.0004671200060434639,
                0.0009330300069879741,
                0.0007883800000000272,
                0.0007548200000000111,
                0.0005617899864856153,
                0.0013246499999999897,
                0.0004036299999999859,
                -0.0014307100000000314,
                0.0021324000161308796,
                0.00154,
            ],
            num_rows,
            None,
        )
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans["destination_key"])
    KMeansModel = inspect["KMeansModel"]
    clusters = KMeansModel["centers"][0]
    print "clusters:", h2o.dump_json(clusters)

    # first call establishes the baseline; later calls must match it
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
    def test_KMeans_constant_col_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 10),
            (100, 10, 'cB', 10),
            (100, 9, 'cC', 10),
            (100, 8, 'cD', 10),
            (100, 7, 'cE', 10),
            (100, 6, 'cF', 10),
            (100, 5, 'cG', 10),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseResult['destination_key']

            kwargs = {'k': 2, 'initialization': 'Furthest', 'destination_key': 'benign_k.hex', 'max_iter': 25}
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # check center list (first center) has same number of cols as source data
            self.assertEqual(colCount, len(centers[0]),
                "kmeans first center doesn't have same # of values as dataset row %s %s" % (colCount, len(centers[0])))
Example #4
0
    def test_B_kmeans_benign(self):
        """Parse benign.csv asynchronously (noPoll), wait for the parse job,
        then run KMeans (k=3, Furthest, fixed seed) twice and compare each
        trial's centers to hard-coded expected results."""
        h2o.beta_features = True
        csvPathname = "logreg"
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        
        # noPoll=True: parse returns immediately; the pollWaitJobs below blocks
        # until the parse job finishes
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # each tuple: (center coordinates, rows per cluster, sqr error per cluster)
        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            params = {'k': 3, 
                      'initialization': 'Furthest', 
                      'ignored_cols' : None, 
                      'destination_key': 'benign_k.hex',
                      'max_iter': 50,
                      'seed': 265211114317615310,
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            # NOTE(review): csvFilename (not the full csvPathname) is passed here;
            # several other tests in this file pass the path -- confirm intended
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #5
0
    def test_KMeans_covtype_cols_fvec(self):
        h2o.beta_features = True
        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': range(11, numCols),
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                for trial2 in range(3):
                    timeoutSecs = 600
                    start = time.time()
                    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                    elapsed = time.time() - start
                    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                    # this does an inspect of the model and prints the clusters
                    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
    def test_KMeans2_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseResult['destination_key']

            kwargs = {
                'k': 2, 
                'initialization': 'Furthest', 
                'destination_key': 'benign_k.hex',
                'max_iter': 10,
            }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
    def test_KMeans_constant_col_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseResult['destination_key']

            kwargs = {'k': 2, 'initialization': 'Furthest', 'destination_key': 'benign_k.hex', 'max_iter': 25}
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # check center list (first center) has same number of cols as source data
            self.assertEqual(colCount, len(centers[0]),
                "kmeans first center doesn't have same # of values as dataset row %s %s" % (colCount, len(centers[0])))
Example #8
0
    def test_C_kmeans_prostate(self):
        """Parse prostate.csv, then run KMeans (k=3, Furthest, fixed seed,
        ID column ignored) twice, comparing each trial's results against
        hard-coded expected values."""
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")
        # wait for any outstanding jobs before continuing
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # loop, to see if we get same centers
        # each tuple: (center coordinates, rows per cluster, sqr error per cluster)
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            params = {'k': 3, 
                     'initialization': 'Furthest', 
                     'ignored_cols': "ID",
                     'destination_key': 'prostate_k.hex',
                     'max_iter': 100,
                     'seed': 265211114317615310
                    }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #9
0
    def test_C_kmeans_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', 
            path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")

        # loop, to see if we get same centers
        # this was sklearn.cluster.Kmeans with first col removed. num_rows and error is 0 here 
        expected = [
            ([0.36, 66.44,  1.09,  2.21,  1.06, 10.84, 34.16,  6.31], 136, 46045),
            ([0.37, 65.77,  1.07,  2.23,  1.11, 10.49,  4.24,  6.31], 215, 36956), 
            ([0.83, 66.17,  1.21,  2.86,  1.34, 73.30, 15.57,  7.31], 29,  33412),
            ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.1, 0.1, 0.1)
        # try saving best!
        bestError = None
        for trial in range(10):

            seed = random.randint(0, sys.maxint)
            seed = 7509839924844349324

            if h2o.beta_features:
                params = {'k': 3, 
                         # 'initialization': 'Furthest', 
                         'initialization': 'PlusPlus',
                         'ignored_cols': "ID",
                         'destination_key': 'prostate_k.hex',
                         'max_iter': 1000,
                         'seed': seed
                        }
            else:
                params = {'k': 3, 
                         # 'initialization': 'Furthest', 
                         'initialization': 'PlusPlus',
                         'cols': 'CAPSULE, AGE, RACE, DPROS, DCAPS, PSA, VOL, GLEASON',
                         'destination_key': 'prostate_k.hex',
                         'max_iter': 100,
                         'seed': seed
                        }

            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
            error = kmeans['model']['error']
            if not bestError or error < bestError:
                print 'Found smaller error:', error
                bestError = error
                bestCenters = centers
                bestSeed = seed
                bestTrial = trial
            
        print "bestTrial:", bestTrial
        print "bestError:", bestError
        print "bestCenters:", bestCenters
        print "bestSeed:", bestSeed
Example #10
0
    def test_libsvm(self):
        """Write a synthetic svmlight-style dataset, parse it with the
        module-level PARSER_TYPE, inspect/summarize it, and (when DO_KMEANS
        is set) run KMeans (k=3, Furthest, fixed seed) on the parsed frame.

        NOTE(review): numRows/numCols are captured but unused here; numCols is
        only referenced by the commented-out ignored_cols range below.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10

            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE,
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols':
                    None,  #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #11
0
    def test_KMeans2_winesPCA(self):
        """Parse winesPCA.csv (2 PCA components), run KMeans (k=3, Furthest,
        fixed seed) ten times, and compare each trial's (center, rows, error)
        tuples against the expected set for the configured OLD_KMEANS mode."""
        h2o.beta_features = True
        csvPathname = 'winesPCA.csv'
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        kwargs = {
            'initialization': 'Furthest',
            # 'initialization': '',
            # 'initialization': 'PlusPlus',
            'max_iter': 50,
            'k': 3,
            'seed': '265211114317615310',
        }

        timeoutSecs = 480

        # try the same thing 10 times
        for trial in range (10):
            start = time.time()

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            if OLD_KMEANS:
                expected = [
                    ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794),
                    ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745),
                    ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
                    ]
            else:
                # error:  258.051462872
                expected = [
                    ([-2.23406681758209, -0.7729819755373136], 67, 96.85372611195429),
                    ([0.25174392601612905, 1.792222172419355], 62, 99.21823733913352),
                    ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
                        ]

            # multipliers on the expected values for allowed
            # within 2% of best with random seeds?
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
    def test_kmeans_prostate(self):
        """Parse prostate.csv and run KMeans (k=3, PlusPlus, scikit-matching
        seed, ID ignored), inspect the model via kmeans_view, and compare the
        results against expected (center, rows, error) tuples."""
        h2o.beta_features = True  # fvec

        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers

        # each tuple: (center coordinates, rows per cluster, sqr error per cluster)
        expected = [
            ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955),
            ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045),
            ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(1):
            # kmeansSeed = random.randint(0, sys.maxint)
            # actually can get a slightly better error sum with a different seed
            # this seed gets the same result as scikit
            kmeansSeed = 6655548259421773879

            kwargs = {
                "ignored_cols": "ID",
                "k": 3,
                # 'initialization': 'Furthest',
                "initialization": "PlusPlus",
                "destination_key": "prostate_k.hex",
                "max_iter": 500,
                "seed": kmeansSeed,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                # 'seed': 265211114317615310}
            }

            # for fvec only?
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            # FIX! how do I get the kmeans result?
            ### print "kmeans result:", h2o.dump_json(kmeans)
            # can't do this
            # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
            # fetch the model directly from node 0 instead of via Inspect
            modelView = h2o.nodes[0].kmeans_view(model="prostate_k.hex")
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

            model = modelView["model"]
            clusters = model["centers"]
            within_cluster_variances = model["within_cluster_variances"]
            total_within_SS = model["total_within_SS"]
            print "within_cluster_variances:", within_cluster_variances
            print "total_within_SS:", total_within_SS
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, "d", **kwargs
            )
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            # pass SEED so it's repeatable
            kwargs = {
                'k': CLUSTERS, 
                'max_iter': 10,
                'initialization': 'Furthest', 
                'cols': None, 
                'destination_key': 'syn_spheres100.hex', 
                'seed': SEED
            }
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i,center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)
                self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" x not correct.")
                self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" y not correct.")
                self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" z not correct.")

            print "Trial #", trial, "completed"
    def test_kmeans_benign(self):
        """Parse benign.csv, run KMeans (k=4, PlusPlus, random seed), inspect
        the model via kmeans_view, and pull centers via bigCheckResults.

        NOTE(review): `expected` and `allowedDelta` are built but never compared
        -- presumably because the random seed makes exact center matching
        unreliable; confirm whether a comparison step was intended.
        """
        h2o.beta_features = True  # fvec
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(
            bucket="smalldata", path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180, doSummary=False
        )

        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\nStarting", csvFilename

        # each tuple: (center coordinates, rows per cluster, sqr error per cluster);
        # None means the error slot is not checked
        expected = [
            ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None),
            ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None),
            ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None),
            ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # loop, to see if we get same centers

        for trial in range(1):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879

            kwargs = {
                "k": 4,
                "initialization": "PlusPlus",
                "destination_key": "benign_k.hex",
                # 'seed': 265211114317615310,
                "max_iter": 50,
                "seed": kmeansSeed,
            }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)

            ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
            # fetch the model directly from node 0
            modelView = h2o.nodes[0].kmeans_view(model="benign_k.hex")
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
            model = modelView["model"]
            clusters = model["centers"]
            within_cluster_variances = model["within_cluster_variances"]
            total_within_SS = model["total_within_SS"]
            print "within_cluster_variances:", within_cluster_variances
            print "total_within_SS:", total_within_SS

            # make this fvec legal?
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, "d", **kwargs
            )
# Example #15
    def test_KMeans_twit_fvec(self):
        h2o.beta_features = True
        csvFilename = "Twitter2DB.txt"
        print "\nStarting", csvFilename

        # h2b.browseTheCloud()
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')

        # both of these centers match what different R/Scikit packages get
        expected1 = [
            # expected centers are from R. rest is just from h2o
            ([310527.2, 13433.89], 11340, None),
            ([5647967.1, 40487.76], 550, None),
            ([21765291.7, 93129.26], 14, None),
        ]

        # this is what we get with Furthest
        expected2 = [
            ([351104.74065255735,
              15421.749823633158], 11340, 5021682274541967.0),
            ([7292636.589090909, 7575.630909090909], 550, 6373072701775582.0),
            ([34406781.071428575, 244878.0], 14, 123310713697348.92),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.0001, 0.0001, 0.0001)
        for trial in range(2):
            kwargs = {
                'k': 3,
                'max_iter': 50,
                'normalize': 0,
                'initialization': 'Furthest',
                # 'initialization': 'PlusPlus',
                'destination_key': 'kmeans_dest_key',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310
            }
            init_choices = ['Furthest', 'PlusPlus']
            kwargs['initialization'] = init_choices[trial % len(init_choices)]

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=5,
                                       **kwargs)
            # can't inspect a kmeans2 model?
            # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'], verbose=True)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected2,
                                                allowedDelta,
                                                trial=trial)
    def test_B_kmeans_benign(self):
        """KMeans (fvec, k=4, fixed seed) on logreg/benign.csv.

        Parse and KMeans can run asynchronously (when DO_POLL is False the
        jobs are polled to completion and result keys are patched in by hand).
        When DO_IGNORE is set, the STR column is excluded from clustering.
        Clusters are compared against precomputed centers/sizes.
        """
        h2o.beta_features = True # fvec
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, 
            timeoutSecs=180, noPoll=not DO_POLL, doSummary=False)

        if not DO_POLL:
            # async parse: wait for the job, then patch in the destination key ourselves
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            parseResult['destination_key'] = hex_key
        
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # (center, rows_in_cluster, error) tuples from a prior known-good run
        expected = [
            ([10.5, 2.8, 40.3, 0.0, 12.0, 0.8, 1.6, 21.1, 11.4, 0.7, 2.9, 206.2, 36.7, 1.5], 15, 0) ,
            ([23.72897196261682, 2.3271028037383177, 44.81308411214953, 0.34579439252336447, 13.093457943925234, 1.4579439252336448, 1.3177570093457944, 24.16129367150993, 13.317757009345794, 0.5071931108136043, 2.6604011393039024, 121.6822429906542, 40.13084112149533, 1.691588785046729], 110, 0) ,
            ([29.2625, 2.7, 48.5125, 0.1625, 12.0625, 1.0375, 1.4875, 23.023665714263917, 12.6875, 0.5073033705353737, 3.090870788693428, 160.95, 43.3, 1.65], 71, 0) ,
            ([38.333333333333336, 2.3333333333333335, 52.666666666666664, 0.0, 14.333333333333334, 2.3333333333333335, 1.6666666666666667, 25.85955047607422, 12.0, 0.5056179761886597, 3.2846442063649497, 261.6666666666667, 43.0, 1.0], 4, 0) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # loop, to see if we get same centers

        # DO_IGNORE toggles exclusion of the STR column; everything else is identical
        if DO_IGNORE:
            kwargs = {'k': 4, 'ignored_cols': 'STR', 'destination_key': 'benign_k.hex', 'seed': 265211114317615310, 'max_iter': 50}
        else:
            kwargs = {'k': 4, 'ignored_cols': None, 'destination_key': 'benign_k.hex', 'seed': 265211114317615310, 'max_iter': 50}

        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs)

        if not DO_POLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            # hack..supposed to be there like va
            kmeans['destination_key'] = 'benign_k.hex'
        ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
        # fetch the model view directly to report per-cluster variance and total error
        modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex')
        h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
        model = modelView['model']
        clusters = model['clusters']
        cluster_variances = model['cluster_variances']
        error = model['error']
        print "cluster_variances:", cluster_variances
        print "error:", error

        # make this fvec legal?
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
# Example #17
    def test_KMeans_winesPCA(self):
        csvPathname = 'winesPCA.csv'
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put',
                                       timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            #appears not to take 'cols'?
            'cols': None,
            'initialization': 'Furthest',
            'k': 3,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }

        timeoutSecs = 480

        # try the same thing 5 times
        for trial in range(10):
            start = time.time()

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            expected = [
                ([-2.25977535371875,
                  -0.8631572635625001], 64, 83.77800617624794),
                ([0.16232721958461543,
                  1.7626161107230771], 65, 111.64440134649745),
                ([2.7362112930204074,
                  -1.2107751495102044], 49, 62.6290553489474),
            ]
            # multipliers on the expected values for allowed
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList,
                                                expected, allowedDelta, trial)
# Example #18
    def test_KMeansGrid_basic_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path='standard/covtype.data',
                                           schema='local',
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "python_source:", parseResult['python_source']
            csvPathname = parseResult['python_source']

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            destination_key = 'd.hex'
            params = {
                'k': '2,3',
                # 'initialization': 'Furthest',
                'initialization': None,
                'seed': 3923021996079663354,
                'normalize': 0,
                'max_iter': '2',
                'destination_key': destination_key
            }

            for trial in range(3):
                kwargs = params.copy()
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans (with grid) end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                print "FIX! have to interrogate each job result to see kmeans grid results"
                ### h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                # This doesn't work (inspecting the model)
                # inspect = h2o_cmd.runInspect(None,key=destination_key)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeansGrid_params_rand2_fvec(self):
        if h2o.localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ("covtype.data", 800)
            ]
        else:
            csvFilenameList = [("covtype.data", 800)]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, timeoutSecs=2000, pollTimeoutSecs=60
            )
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + ".hex"
                params = {"k": "2,3", "destination_key": destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(
                    parseResult=parseResult,
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=2,
                    pollTimeoutSecs=60,
                    noPoll=True,
                    **kwargs
                )
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                    (elapsed / timeoutSecs) * 100
                )
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
# Example #20
    def test_kmeans_prostate(self):
        h2o.beta_features = True  # fvec

        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       header=1,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers

        expected = [
            ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955),
            ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045),
            ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(1):
            # kmeansSeed = random.randint(0, sys.maxint)
            # actually can get a slightly better error sum with a different seed
            # this seed gets the same result as scikit
            kmeansSeed = 6655548259421773879

            kwargs = {
                'ignored_cols': 'ID',
                'k': 3,
                # 'initialization': 'Furthest',
                'initialization': 'PlusPlus',
                'destination_key': 'prostate_k.hex',
                'max_iter': 500,
                'seed': kmeansSeed,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                # 'seed': 265211114317615310}
            }

            # for fvec only?
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=5,
                                       **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
    def test_C_kmeans_prostate(self):
        """KMeans (fvec, k=3, Furthest init, fixed seed) on logreg/prostate.csv.

        When DO_POLL is False the KMeans job is started async, polled to
        completion, and the destination key is patched in by hand. The model
        view is fetched to print per-cluster variances/error, then clusters
        are compared against precomputed results.
        """
        h2o.beta_features = True # fvec

        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        # (center, rows_in_cluster, error) tuples from a prior known-good run
        expected = [
            ([43.07058823529412, 0.36470588235294116, 67.70588235294117, 1.1058823529411765, 2.3529411764705883, 1.2117647058823529, 17.33529411764706, 14.201176470588232, 6.588235294117647], 103, 0) ,
            ([166.04347826086956, 0.4658385093167702, 66.09316770186335, 1.0807453416149069, 2.3043478260869565, 1.0807453416149069, 15.0632298136646, 16.211118012422357, 6.527950310559007], 136, 0) ,
            ([313.4029850746269, 0.35074626865671643, 64.91791044776119, 1.0820895522388059, 2.1791044776119404, 1.0746268656716418, 14.601492537313437, 16.35686567164179, 6.082089552238806], 141, 0) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        kwargs = {'k': 3, 'initialization': 'Furthest', 'destination_key': 'prostate_k.hex', 'max_iter': 50,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310}

        # for fvec only?
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            # hack..supposed to be there like va
            kmeans['destination_key'] = 'prostate_k.hex'
        # FIX! how do I get the kmeans result?
        ### print "kmeans result:", h2o.dump_json(kmeans)
        # can't do this
        # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
        modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex')
        h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

        model = modelView['model']
        clusters = model['clusters']
        cluster_variances = model['cluster_variances']
        error = model['error']
        print "cluster_variances:", cluster_variances
        print "error:", error
        # variance of 0 might be legal with duplicated rows. wasn't able to remove the duplicate rows of NAs at 
        # bottom of benign.csv in ec2
        # for i,c in enumerate(cluster_variances):
        #    if c < 0.1:
        #        raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i))
        

        # make this fvec legal?
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
# Example #22
    def test_kmeans_sphere3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key)

        for trial in range(10):
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            kwargs = {
                'k': 3,
                'max_iter': 25,
                'initialization': 'Furthest',
                'destination_key': 'spheres3.hex',
                # 'seed': 265211114317615310,
                'seed': 0,
            }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            expected = [
                ([100, 100, 100], 1000000, 60028168),
                ([200, 200, 200], 2000000, 479913618),
                ([300, 300, 300], 3000000, 1619244994),
            ]
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)

            gs = h2o.nodes[0].gap_statistic(source=hex_key,
                                            k_max=5,
                                            timeoutSecs=300)
            print "gap_statistic:", h2o.dump_json(gs)
    def test_KMeansGrid_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + '.hex'
                params = {'k': '2,3', 'destination_key': destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeans_twit(self):
        csvFilename = "Twitter2DB.txt"
        print "\nStarting", csvFilename

        # h2b.browseTheCloud()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=csvFilename + ".hex", schema='put')

        # both of these centers match what different R/Scikit packages get
        expected1 = [
                # expected centers are from R. rest is just from h2o
                ([310527.2, 13433.89], 11340, None),
                ([5647967.1, 40487.76], 550, None),
                ([21765291.7, 93129.26], 14,  None),
            ]

        # this is what we get with Furthest
        expected2 = [
                ([351104.74065255735, 15421.749823633158], 11340, 5021682274541967.0) ,
                ([7292636.589090909, 7575.630909090909], 550, 6373072701775582.0) ,
                ([34406781.071428575, 244878.0], 14, 123310713697348.92) ,
            ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.0001, 0.0001, 0.0001)
        for trial in range(2):
            kwargs = {
                'k': 3, 
                'max_iter': 50,
                'normalize': 0,
                'cols': '0,1',
                'initialization': 'Furthest', 
                # 'initialization': 'PlusPlus',
                'destination_key': 'kmeans_dest_key',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310
            }
            init_choices = ['Furthest', 'PlusPlus']
            kwargs['initialization'] = init_choices[trial % len(init_choices)]

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'], verbose=True)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)

            if 1==0:
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeans")
                # Comment sleep out to get a clean grep.
                # time.sleep(3600)

            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected2, allowedDelta, trial=trial)
# Example #25
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10
        
            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
# Example #26
    def test_KMeans_covtype20x_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 1200, 'cA'),
            ]
        else:
            # None is okay for hex_key
            csvFilenameList = [
                ('covtype20x.data', 1200, 'cA'),
                # ('covtype200x.data', 1000,'cE'),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs, hex_key in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=csvPathname,
                timeoutSecs=2000,
                hex_key=hex_key)  # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            k = 2
            kwargs = {
                'max_iter': 25,
                'initialization': 'Furthest',
                'k': k,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
    def test_KMeansGrid_basic_fvec(self):
        """Run a KMeans grid (k='2,3') on covtype three times and time each trial.

        Grid results are only timed/printed; per-model interrogation is still a
        FIX! below. NOTE(review): the loop variable csvFilename is never used --
        the parse path is hard-coded to 'standard/covtype.data'; confirm intended.
        """
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='standard/covtype.data', schema='local',
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "python_source:", parseResult['python_source']
            csvPathname = parseResult['python_source']
            
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            destination_key = 'd.hex'
            # 'k' as a comma-separated string appears to request a grid over k values -- confirm
            params = {
                'k': '2,3', 
                # 'initialization': 'Furthest', 
                'initialization': None,
                'seed': 3923021996079663354, 
                'normalize': 0, 
                'max_iter': '2',
                'destination_key': destination_key
            }
    
            for trial in range(3):
                # copy so any mutation by runKMeans doesn't leak into the next trial
                kwargs = params.copy()
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans (with grid) end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                print "FIX! have to interrogate each job result to see kmeans grid results"
                ### h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                # This doesn't work (inspecting the model)
                # inspect = h2o_cmd.runInspect(None,key=destination_key)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
# ---- Example #28 (score: 0) -- scraped example boundary marker ----
    def test_kmeans_iris_fvec(self):
        """KMeans k=3 on iris (output column C5 ignored), 3 deterministic trials.

        Compares clusters to known reference centers, then sanity-checks that
        the gap statistic reports a nonzero k_best.
        """
        h2o.beta_features = True
        csvFilename = 'iris.csv'
        csvPathname = 'iris/' + csvFilename

        print "\nStarting", csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key)

        k = 3
        ignored_cols = 'C5'
        for trial in range(3):
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            kwargs = {
                'ignored_cols': ignored_cols, # ignore the output
                'k': k, 
                'max_iter': 25,
                'initialization': 'Furthest',
                'destination_key': 'iris.hex', 
                'seed': 0,
                }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # (center, rows_in_cluster, error) triples for the 4 non-ignored columns
            expected = [
                # if ignored_cols isn't used
                # ([5, 3.4, 1.46, 0.244, 0.0], 50, 15.24) ,
                # ([5.9, 2.76, 4.26, 1.33, 1.02], 51, 32.9) ,
                # ([6.6, 2.98, 5.57, 2.03, 2.0], 49, 39.15) ,
                ([5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999], 50, 15.240400000000003) ,
                ([5.901612903225807, 2.748387096774194, 4.393548387096775, 1.4338709677419357], 62, 39.82096774193549) ,
                ([6.8500000000000005, 3.073684210526315, 5.742105263157894, 2.0710526315789473], 38, 23.87947368421053) ,
            ]
            

            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

            gs = h2o.nodes[0].gap_statistic(source=hex_key, ignored_cols=ignored_cols, k_max=k)
            print "gap_statistic:", h2o.dump_json(gs)

            k_best = gs['gap_model']['k_best']
            self.assertTrue(k_best!=0, msg="k_best shouldn't be 0: %s" % k_best)
    def test_KMeans_covtype20x_fvec(self):
        """KMeans k=2 on the 20x-replicated covtype dataset, then gap statistic.

        Results are checked via simpleCheckKMeans/bigCheckResults; the gap
        statistic output is only printed, not asserted.
        """
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 1200, 'cA'),
                ]
        else:
            # None is okay for hex_key
            csvFilenameList = [
                ('covtype20x.data', 1200,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs, hex_key in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                timeoutSecs=2000, hex_key=hex_key) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            k = 2
            kwargs = {
                'max_iter': 25,
                'initialization': 'Furthest',
                'k': k, 
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            gs = h2o.nodes[0].gap_statistic(source=hex_key, k_max=8)
            print "gap_statistic:", h2o.dump_json(gs)
    def test_B_kmeans_benign(self):
        """KMeans on benign.csv for k=2..5; prints per-cluster variances/error.

        NOTE(review): `expected` and `allowedDelta` are built below but never
        passed to a compare call in this test -- only the model view fields are
        printed. Confirm whether a comparison was intended.
        """
        importFolderPath = "standard"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, header=1, 
            timeoutSecs=180, noPoll=h2o.beta_features, doSummary=False)

        if h2o.beta_features:
            # fvec path parses async: wait for the job, then patch in the known key
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            parseResult['destination_key'] = hex_key
        
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for k in range(2, 6):
            kwargs = {'k': k, 'ignored_cols_by_name': None, 'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 10})
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=h2o.beta_features, **kwargs)

            if h2o.beta_features:
                h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
                # hack..supposed to be there like va
                kmeans['destination_key'] = 'benign_k.hex'
            ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
            modelView = h2o.nodes[0].kmeans_model_view(model='benign_k.hex')
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
            model = modelView['model']
            clusters = model['clusters']
            cluster_variances = model['cluster_variances']
            error = model['error']
            print "cluster_variances:", cluster_variances
            print "error:", error
    def test_KMeansGrid_params_rand2(self):
        """KMeans grid with random params on covtype, 3 trials; async launch.

        Launches with noPoll=True and then waits on the job queue; extracting
        the per-model grid results is still an open FIX! below.
        """
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            h2o.beta_features = True # no grid for VA
            for trial in range(3):
                # default; 'c(2,3)' looks like an R-style vector spec for the k grid -- confirm
                destinationKey = csvFilename + "_" + str(trial) + '.hex'
                params = {'k': 'c(2,3)', 'destination_key': destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()
        
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_C_kmeans_prostate(self):
        """KMeans on prostate.csv for k=2..5; fails if any cluster variance < 0.1.

        NOTE(review): `expected` and `allowedDelta` are defined but never
        compared against results in this test -- only the model-view variance
        sanity check runs. Confirm whether a comparison was intended.
        """
        importFolderPath = "standard"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for k in range(2, 6):
            kwargs = {'k': k, 'initialization': 'Furthest', 'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 50})

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, noPoll=h2o.beta_features, **kwargs)
            if h2o.beta_features:
                # async on fvec: wait for the job, then patch in the known model key
                h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
                # hack..supposed to be there like va
                kmeans['destination_key'] = 'prostate_k.hex'
            # FIX! how do I get the kmeans result?
            ### print "kmeans result:", h2o.dump_json(kmeans)
            # can't do this
            # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
            modelView = h2o.nodes[0].kmeans_model_view(model='prostate_k.hex')
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

            model = modelView['model']
            clusters = model['clusters']
            cluster_variances = model['cluster_variances']
            error = model['error']
            print "cluster_variances:", cluster_variances
            print "error:", error
            # a near-zero variance cluster suggests a degenerate/empty cluster
            for i,c in enumerate(cluster_variances):
                if c < 0.1:
                    raise Exception("cluster_variance %s for cluster %s is too small. Doesn't make sense. Ladies and gentlemen, this is Chewbacca. Chewbacca is a Wookiee from the planet Kashyyyk. But Chewbacca lives on the planet Endor. Now think about it...that does not make sense!" % (c, i))
    def test_KMeans_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'max_iter': 20,
                    'k': 1,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
    def test_KMeans_winesPCA(self):
        """KMeans k=3 on PCA-reduced wines data, 10 trials with a fixed seed;
        each trial's clusters are compared against known reference centers."""
        csvPathname = 'winesPCA.csv'
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            #appears not to take 'cols'?
            'cols': None,
            'initialization': 'Furthest',
            'k': 3,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }

        timeoutSecs = 480

        # try the same thing 10 times
        for trial in range (10):
            start = time.time()

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            expected = [
                ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794) ,
                ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745) ,
                ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474) ,
            ]
            # multipliers on the expected values for allowed
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def kmeans_doit(self, csvFilename, bucket, csvPathname, numRows, timeoutSecs=30):
    """Parse csvPathname, run KMeans with k=1, check the single center against
    known near-zero values, and compare centers across successive calls.

    `self` is a test-case instance; it must carry a `clusters1` attribute,
    which this helper uses to remember the first run's centers so later runs
    (on replicated files) can be compared against it.
    """
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=20)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        'k': 1, 
        'initialization': 'Furthest',
        'destination_key': 'KMeansModel.hex',
        'max_iter': 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

    # with k=1 the single center should match these near-zero values; error is not checked (None)
    expected = [
        ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], numRows, None)
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    # inspect doesn't work
    # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'])
    # KMeansModel = inspect['KMeansModel']
    modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
    model = modelView['model']
    clusters = model['centers']
    within_cluster_variances = model['within_cluster_variances']
    total_within_SS = model['total_within_SS']
    print "within_cluster_variances:", within_cluster_variances
    print "total_within_SS:", total_within_SS
    
    # first call records the centers; later calls compare against that first run
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
    def test_kmeans_sphere3_fvec(self):
        """KMeans k=3 on a generated 3-sphere dataset; 10 deterministic trials.

        The expected centers/row counts come from the generator, so the found
        clusters are compared exactly (within 1% multipliers); also runs the
        gap statistic and prints its output.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_spheres3_" + str(SEED) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=hex_key)

        for trial in range(10):
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            kwargs = {
                "k": 3,
                "max_iter": 25,
                "initialization": "Furthest",
                "destination_key": "spheres3.hex",
                # 'seed': 265211114317615310,
                "seed": 0,
            }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, "d", **kwargs
            )

            # (center, rows_in_cluster, error) per generated sphere
            expected = [
                ([100, 100, 100], 1000000, 60028168),
                ([200, 200, 200], 2000000, 479913618),
                ([300, 300, 300], 3000000, 1619244994),
            ]
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

            gs = h2o.nodes[0].gap_statistic(source=hex_key, k_max=5, timeoutSecs=300)
            print "gap_statistic:", h2o.dump_json(gs)
    def test_KMeans_params_rand2_fvec(self):
        """Run KMeans 3 times on covtype with randomly picked parameters;
        each trial starts from small defaults overlaid by define_params(SEED)."""
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'max_iter': 20, 
                    'k': 1, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
# ---- Example #38 (score: 0) -- scraped example boundary marker ----
    def test_kmeans_prostate(self):
        """KMeans k=3 (PlusPlus init) on prostate.csv, ID column ignored;
        clusters are compared against reference (center, rows, error) triples."""
        h2o.beta_features = True # fvec

        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers

        expected = [
            ([0.37,65.77,1.07,2.23,1.11,10.49,4.24,6.31], 215, 36955), 
            ([0.36,66.44,1.09,2.21,1.06,10.84,34.16,6.31], 136, 46045),
            ([0.83,66.17,1.21,2.86,1.34,73.30,15.57,7.31], 29, 33412),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(1):
            # kmeansSeed = random.randint(0, sys.maxint)
            # actually can get a slightly better error sum with a different seed
            # this seed gets the same result as scikit
            kmeansSeed = 6655548259421773879

            kwargs = {
                'ignored_cols': 'ID',
                'k': 3, 
                # 'initialization': 'Furthest', 
                'initialization': 'PlusPlus',
                'destination_key': 'prostate_k.hex', 
                'max_iter': 500,
                'seed': kmeansSeed,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                # 'seed': 265211114317615310}
            }

            # for fvec only?
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
# ---- Example #39 (score: 0) -- scraped example boundary marker ----
    def test_kmeans_benign(self):
        h2o.beta_features = True
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, header=1, 
            timeoutSecs=180, doSummary=False)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename


        expected = [
            ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None), 
            ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None), 
            ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None), 
            ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None), 
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # loop, to see if we get same centers

        for trial in range(1):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879

            kwargs = {
                'k': 4, 
                'initialization': 'PlusPlus',
                'destination_key': 'benign_k.hex', 
                # 'seed': 265211114317615310, 
                'max_iter': 50,
                'seed': kmeansSeed,
            }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
# ---- Example #40 (score: 0) -- scraped example boundary marker ----
    def notest_B_kmeans_benign(self):
        """Disabled test (the 'notest_' prefix keeps it out of discovery).

        Zeroes NAs in all 14 columns via exec expressions, then runs KMeans k=4
        twice with a fixed seed and compares clusters to sklearn-derived centers
        (rows/errors still unfilled -- see FIX! below).
        """
        h2o.beta_features = True
        csvPathname = "logreg"
        csvFilename = "benign.csv"
        hex_key = csvFilename + ".hex"
        print "\nStarting", csvFilename
        
        parseResult = h2i.import_parse(bucket='smalldata', 
            path=csvPathname + "/"+csvFilename, schema='local', hex_key=hex_key)

        # FIX! have to fill in expected rows and error here
        # this is from sklearn.cluster.KMeans, with NA's converted to 0
        expected = [
            ([ 8.86,  2.43, 35.53,  0.31, 13.22,  1.47,  1.33, 20.06, 13.08,  0.53,  2.12, 128.61, 35.33,  1.57], None, None),
            ([33.47,  2.29, 50.92,  0.34, 12.82,  1.33,  1.36, 21.43, 13.30,  0.37,  2.52, 125.40, 43.91,  1.79], None, None),
            ([27.64,  2.87, 48.11,  0.09, 11.80,  0.98,  1.51, 21.02, 12.53,  0.58,  2.89, 171.27, 42.73,  1.53], None, None),
            ([26.00,  2.67, 46.67,  0.00, 13.00,  1.33,  1.67, 21.56, 11.44,  0.22,  2.89, 234.56, 39.22,  1.56], None, None),
            ]

        # replace NAs with 0.0 in every column (exec columns are 1-based)
        for i in range(14):
            execExpr = '%s[,%s] = is.na(%s[,%s]) ? 0.0 : %s[,%s]' % (hex_key,i+1,hex_key,i+1,hex_key,i+1)
            h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=4)


        # all are multipliers of expected tuple value
        allowedDelta = (0.1, 0.1, 0.1)

        # loop, to see if we get same centers
        for trial in range(2):
            params = {'k': 4, 
                      # 'initialization': 'Furthest', 
                      'initialization': 'PlusPlus', 
                      'destination_key': 'benign_k.hex',
                      'max_iter': 100,
                      'seed': 265211114317615310,
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_KMeans_covtype_fvec(self):
        """KMeans k=2 on covtype, 3 timed trials; only simpleCheckKMeans runs.

        NOTE(review): kwargs hard-codes 'source': u'covtype.hex' while the
        parse destination comes from parseResult -- confirm these match.
        """
        h2o.beta_features = True
        csvFilenameList = [
            ('covtype.data', 800),
        ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(3):
                kwargs = {
                    'source': u'covtype.hex',
                    'destination_key': 'covtype.data_2.hex',
                    'initialization': 'Furthest',
                    # 'max_iter': 20,
                    'max_iter': 50,
                    'k': 2,
                }

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
    def test_kmeans_sphere3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)

        for trial in range(10):
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            kwargs = {
                'k': 3, 
                'max_iter': 25,
                'initialization': 'Furthest',
                'destination_key': 'spheres3.hex', 
                # 'seed': 265211114317615310,
                'seed': 0,
                }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            expected = [
                ([100, 100, 100], 1000000,   60028168),
                ([200, 200, 200], 2000000,  479913618),
                ([300, 300, 300], 3000000, 1619244994),
            ]
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'allstate'
        csvFilename = "train_set.csv"
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 600
        trialMax = 3
        for trial in range(trialMax):
            trialStart = time.time()
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='s3n',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print 'h2o reported parse time:', parseResult['response']['time']
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {'cols': None, 'initialization': 'Furthest', 'k': 12}

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Exemple #44
0
    def test_c5_KMeans_sphere_67MB_fvec(self):
        """KMeans benchmark on a 67MB synthetic 15-sphere dataset.

        Each of 6 trials re-imports and parses the file (H2O deletes the
        source key), then fits KMeans with k=15 — the init method cycles
        by trial — and compares the resulting clusters against `expected`.
        Parse throughput and KMeans timing are logged to the benchmark log.
        """
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
        # file size in bytes; used only to report parse throughput (MB/sec)
        totalBytes = 67306997
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        # each entry: (cluster center, expected row count, expected error);
        # the leading 0.0 per center presumably covers the all-NA first
        # column mentioned below — TODO confirm against the generator
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        # only the last assignment takes effect; the earlier lists are kept
        # as documentation of the fuller logging options
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            # no extra parse args currently; kept for easy experimentation
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            # parse throughput, reported to the shared benchmark log
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # clear out all NAs (walk across cols)..clear to 0
            # temp
            ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
            ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'max_iter': 10,
                'normalize': 1,
                'initialization': 'Furthest',
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
                }

            # cycle through the three init methods across trials
            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            # snapshot the params for the benchmark-log message below
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
            h2i.delete_keys_at_all_nodes()
Exemple #45
0
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            # pass SEED so it's repeatable
            kwargs = {
                'k': CLUSTERS,
                'max_iter': 10,
                'initialization': 'Furthest',
                'cols': None,
                'destination_key': 'syn_spheres100.hex',
                'seed': SEED
            }
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i, center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b  # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)
                self.assertAlmostEqual(a[0],
                                       b[0],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " x not correct.")
                self.assertAlmostEqual(a[1],
                                       b[1],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " y not correct.")
                self.assertAlmostEqual(a[2],
                                       b[2],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " z not correct.")

            print "Trial #", trial, "completed"
    def test_c5_KMeans_sphere_26GB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = 'syn_sphere15_gen_26GB.csv'
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                ([
                    -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                    31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                    -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    19.00092954923767, -10.999565572612255, 90.00028669073289,
                    1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                    30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                    16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                    -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                    11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                    -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    147.00394564757505, 122.98729664236723, 311.0047920137008,
                    2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]
        else:
            expected = [
                ([
                    0.0, -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    0.0, 5.0, 2.0, 340.0, 1817.995920197288,
                    33970406.992053084, 31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    0.0, 10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    0.0, 12.0, 3.0, 168.0, -4066.995950679284,
                    41077063.00269915, -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    0.0, 19.00092954923767, -10.999565572612255,
                    90.00028669073289, 1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    0.0, 20.0, 0.0, 141.0, -3263.0030236302937,
                    6163210.990273981, 30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    0.0, 21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    0.0, 39.0, 3.0, 470.0, -3337.9880599007597,
                    28768057.98852736, 16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    0.0, 40.0, 1.0, 145.0, 950.9990795199593,
                    14602680.991458317, -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    0.0, 42.0, 4.0, 479.0, -3678.0033024834297,
                    8209673.001421165, 11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    0.0, 48.0, 4.0, 71.0, -951.0035145455234,
                    49882273.00063991, -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    0.0, 147.00394564757505, 122.98729664236723,
                    311.0047920137008, 2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='hdfs',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='local',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'],
                                         timeoutSecs=300)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
                                         numRows=numRows,
                                         numCols=numCols,
                                         timeoutSecs=300)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'max_iter': 500,
                # 'normalize': 1,
                'normalize': 0,  # temp try
                'initialization': 'Furthest',
                'destination_key': 'junk.hex',
                # we get NaNs if whole col is NA
                'ignored_cols': 'C1',
                'normalize': 0,
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             benchmarkLogging=benchmarkLogging,
                                             **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            # his does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self,
                                               tupleResultList,
                                               expected,
                                               trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=False,
                                                allowRowError=True,
                                                trial=trial)

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult['model']
            size = model['size']
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (
                    max_iter -
                    1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s",
                    (trial, iterations, max_iter))

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2))

            # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as expected: %s"
                    % (trial, size, expectedSize))

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
Exemple #47
0
    def test_KMeans2_sphere5_inits(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        expectedCenters = write_spheres_dataset(csvPathname, CLUSTERS,
                                                SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        savedResults = []
        Result = collections.namedtuple(
            'Result',
            'trial clusters size cluster_variances error iterations normalized max_iter clustersSorted'
        )

        # save the best for comparison. Print messages when we update best
        sameAsBest = 0
        # big number? to init
        bestResult = Result(None, None, None, None, None, None, None, None,
                            None)
        for trial in range(TRIALS):
            # pass SEED so it's repeatable
            kwargs = {
                'normalize': 0,
                'k': CLUSTERS,
                'max_iter': MAX_ITER,
                'initialization': INIT,
                # 'initialization': 'PlusPlus',
                'destination_key': 'syn_spheres100.hex',
                'seed': SEED
            }

            timeoutSecs = 30
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # see if we took the full limit to get an answer

            # inspect of model doesn't work
            # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            model = kmeansResult['model']
            clusters = model["centers"]
            size = model["size"]
            cluster_variances = model["within_cluster_variances"]
            # round to int to avoid fp error when saying "same"
            error = int(model["total_within_SS"])
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            # clustersSorted = sorted(clusters, key=itemgetter(2))
            clustersSorted = sorted(clusters)

            r = Result(
                trial,
                clusters,
                size,
                cluster_variances,
                error,
                iterations,
                normalized,
                max_iter,
                clustersSorted,
            )

            savedResults.append(r)

            if iterations >= (
                    max_iter -
                    1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s",
                    (iterations, max_iter))

            print "iterations", iterations
            ### print clustersSorted

            # For now, just analyze the one with the lowest error
            # we could analyze how many are not best, and how many are best (maybe just look at error
            print "savedResults, error"
            print r.error
            if bestResult.error and r.error <= bestResult.error:
                sameAsBest += 1
                # we can check that if it has the same error, the sizes should be the same (integer) and reflects centers?
                # should
                if r.size != bestResult.size:
                    raise Exception(
                        "Would expect that if two trials got the same error (rounded to int), the cluster sizes would likely be the same? %s %s"
                        % (r.size, bestResult.size))

            if not bestResult.error:  # init case
                bestResult = r
            elif r.error < bestResult.error:
                print "Trial", r.trial, "has a lower error", r.error, "than current lowest error", bestResult.error
                print "Using it for best now"
                bestResult = r

            print "Trial #", trial, "completed"

        print "\nApparently, %s out of %s trials, got the same best error: %s  (lowest) " % (
            sameAsBest, TRIALS, bestResult.error)
        print "\nh2o best result was from trial %s, centers sorted:" % bestResult.trial
        print bestResult.clustersSorted
        print "\ngenerated centers for comparison"
        print expectedCenters
        for i, center in enumerate(expectedCenters):
            a = center
            bb = bestResult.clustersSorted
            print "bb:", bb
            b = bb[i]
            print "\nexpected:", a
            print "h2o:", b  # h2o result
            aStr = ",".join(map(str, a))
            bStr = ",".join(map(str, b))
            iStr = str(i)
            self.assertAlmostEqual(a[0],
                                   b[0],
                                   delta=1,
                                   msg=aStr + "!=" + bStr +
                                   ". Sorted cluster center " + iStr +
                                   "; x not correct.")
            self.assertAlmostEqual(a[1],
                                   b[1],
                                   delta=1,
                                   msg=aStr + "!=" + bStr +
                                   ". Sorted cluster center " + iStr +
                                   "; y not correct.")
            self.assertAlmostEqual(a[2],
                                   b[2],
                                   delta=1,
                                   msg=aStr + "!=" + bStr +
                                   ". Sorted cluster center " + iStr +
                                   "; z not correct.")
Example #48
0
    def test_KMeans_fuzzy_centers_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DO_TWO_CLUSTER:
            genCenters = [
                [100, 100, 100, 100, 100, 100],
                [200, 200, 200, 200, 200, 200],
            ]

            genCenters = [
                [100, 100],
                [200, 200],
            ]

        else:
            genCenters = [
                [100, 100, 100, 100, 100, 100],
                [110, 110, 110, 110, 110, 110],
                [120, 120, 120, 120, 120, 120],
                [130, 130, 130, 130, 130, 130],
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        rowCount = 10000
        expected = [(g, rowCount, None) for g in genCenters]
        allowedDelta = (0.2, 0.2, 0.2, 0.2, 0.2, 0.2)
        allowedDelta = (0.2, 0.2)
        worstError = None
        bestError = None

        timeoutSecs = 60
        hex_key = 'cA'

        print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        dataset = write_syn_dataset(csvPathname, rowCount, genCenters, SEED)
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=csvFilename + ".hex")
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        allErrors = []
        for trial in range(10):
            seed = random.randint(0, sys.maxint)
            kwargs = {
                'k': len(genCenters),
                'initialization': 'PlusPlus',
                'destination_key': 'k.hex',
                'max_iter': 1000
            }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=60,
                                       **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # save the predicted
            h2o.nodes[0].csv_download(src_key='d',
                                      csvPathname='kmeans_predict.csv')

            # check center list (first center) has same number of cols as source data
            self.assertEqual(
                len(genCenters[0]), len(centers[0]),
                "kmeans first center doesn't have same # of values as dataset row %s %s"
                % (len(genCenters[0]), len(centers[0])))
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)

            if h2o.beta_features:
                error = kmeans['model']['total_within_SS']
                within_cluster_variances = kmeans['model'][
                    'within_cluster_variances']
                print "trial:", trial, "within_cluster_variances:", within_cluster_variances
            else:
                model_key = kmeans["_key"]
                kmeansResult = h2o_cmd.runInspect(key=model_key)
                error = kmeansResult['KMeansModel']['error']
            # compute the sum of the squares of the distance for each cluster
            # for each row, we
            # returns a tuple of numers for each center
            genDistances = calc_best_distance(centers, dataset)
            print "trial:", trial, "genDistances:", genDistances
            print "trial:", trial, "centers:", centers
            print "trial:", trial, "error:", error
            if (abs(genDistances - error)) > (.001 * genDistances):
                raise Exception(
                    "genDistances: %s error: %s are too different" %
                    (genDistances, error))

            if not bestError or error < bestError:
                print 'Found smaller error:', error
                bestError = error
                bestCenters = centers
                bestSeed = seed
                bestTrial = trial

            if not worstError or error > worstError:
                print 'Found larger error:', error
                worstError = error

            allErrors.append(error)

        print "bestTrial:", bestTrial
        print "bestError:", bestError
        print "worstError:", worstError
        print "bestCenters:", bestCenters
        print "bestSeed:", bestSeed
        print "allErrors:", allErrors
    def test_four_billion_rows(self):
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            ("four_billion_rows.csv", "a.hex"),
            ("four_billion_rows.csv", "b.hex"),
        ]
        for (csvFilename, hex_key) in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            value_size_bytes = inspect['value_size_bytes']
            row_size = inspect['row_size']
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1  # plus output
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                num_cols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, num_cols))
            self.assertEqual(4 * 1000000000,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'max_iter': 20,
                'cols': None,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'x': 0,
                'y': 1,
                'n_folds': 0,
                'case_mode': '=',
                'case': 1
            }
            # one coefficient is checked a little more
            colX = 0

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
Example #50
0
    def test_four_billion_rows_fvec(self):
        h2o.beta_features = True
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
        ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=180,
                                           retryDelaySecs=3)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols), \
                "    byteSize:", "{:,}".format(byteSize)

            expectedRowSize = numCols * 1  # plus output
            # expectedValueSize = expectedRowSize * numRows
            expectedValueSize = 8001271520
            self.assertEqual(byteSize, expectedValueSize,
                msg='byteSize %s is not expected: %s' % \
                (byteSize, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                numCols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, numCols))
            self.assertEqual(4 * 1000000000,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 10,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       retryDelaySecs=4,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'response': 'C1',
                'n_folds': 0,
                'family': 'binomial',
            }
            # one coefficient is checked a little more
            colX = 1

            # convert to binomial
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('1', '1', 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            aHack = {'destination_key': "A.hex"}

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
Example #51
0
    def test_KMeans_libsvm_fvec(self):
        h2o.beta_features = True
        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            # FIX! fails KMeansScore
            ("colon-cancer.svm", "cA", 30, 1),
            ("connect4.svm", "cB", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # multi-label class
            # ("tmc2007_train.svm",  "cJ", 30, 1),
            ("mnist_train.svm", "cM", 30, 1),
            ("duke.svm", "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm", "cF", 120, 1
             ),  #Summary2 is slow with 5001 columns
            ("mushrooms.svm", "cG", 30, 1),
            #        ("news20.svm",         "cH", 120, 1), #Summary2 is very slow - disable for now
            ("syn_6_1000_10.svm", "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols':
                    None,  #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #52
0
    def test_kmeans_iris_fvec(self):
        h2o.beta_features = True
        csvFilename = 'iris.csv'
        csvPathname = 'iris/' + csvFilename

        print "\nStarting", csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key)

        k = 3
        ignored_cols = 'C5'
        for trial in range(3):
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            kwargs = {
                'ignored_cols': ignored_cols,  # ignore the output
                'k': k,
                'max_iter': 25,
                'initialization': 'Furthest',
                'destination_key': 'iris.hex',
                'seed': 0,
            }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            expected = [
                # if ignored_cols isn't used
                # ([5, 3.4, 1.46, 0.244, 0.0], 50, 15.24) ,
                # ([5.9, 2.76, 4.26, 1.33, 1.02], 51, 32.9) ,
                # ([6.6, 2.98, 5.57, 2.03, 2.0], 49, 39.15) ,
                ([
                    5.005999999999999, 3.4180000000000006, 1.464,
                    0.2439999999999999
                ], 50, 15.240400000000003),
                ([
                    5.901612903225807, 2.748387096774194, 4.393548387096775,
                    1.4338709677419357
                ], 62, 39.82096774193549),
                ([
                    6.8500000000000005, 3.073684210526315, 5.742105263157894,
                    2.0710526315789473
                ], 38, 23.87947368421053),
            ]

            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)

            gs = h2o.nodes[0].gap_statistic(source=hex_key,
                                            ignored_cols=ignored_cols,
                                            k_max=k)
            print "gap_statistic:", h2o.dump_json(gs)

            k_best = gs['gap_model']['k_best']
            self.assertTrue(k_best != 0,
                            msg="k_best shouldn't be 0: %s" % k_best)
    def test_kmeans2_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 5 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'initialization': 'Furthest', 
                'destination_key': 'syn_spheres100.hex',
                'max_iter': 15,
            }
            timeoutSecs = 100
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # can't inspect a kmeans2 model
            # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            # cluster centers can return in any order
            model = kmeansResult['model']
            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]


            # the way we create the centers above, if we sort on the sum of xyz
            # we should get the order the same as when they were created.
            # to be safe, we'll sort the centers that were generated too, the same way
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i,center in enumerate(centersSorted):
                # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem.
                # Assuming that the difference between adjacent sums of all center values, 
                # is greater than 2x the sum of all max allowed variance on each value, 
                # Then the sums will be unique and non-overlapping with allowed variance.
                # So a sort of the centers, keyed on sum of all values for a center.
                # will create an ordering that can be compared. 
                # sort gen'ed and actual separately.
                # Adjacent center hamming distance check is done during gen above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)

                for i, v in enumerate(a):
                    emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct."
                    self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
    def test_KMeans_covtype_fvec(self):
        csvFilenameList = [
            ('covtype.data', 800),
        ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(2):
                kwargs = {
                    'k': 6,
                    'initialization': 'Furthest',
                    # 'initialization': '',
                    # 'ignored_cols': range(11, inspect['numCols']),
                    # ignore the response
                    'ignored_cols_by_name': 'C55',
                    'max_iter': 100,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results
                    'seed': 265211114317615310
                }

                start = time.time()
                kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

                expected = [
                    ([
                        2781.64184460309, 162.69950733599902,
                        16.545275983574268, 243.73547234768156,
                        50.48239522121315, 942.4480922085701,
                        208.3915356763203, 218.7135425941215,
                        140.10956243018794, 1040.6795741397266,
                        0.22024185323685105, 0.0845245225799837,
                        0.4957505706376572, 0.19948305354550802,
                        0.01635558145683929, 0.033196811983660604,
                        0.026025394050259283, 0.04566180477986607,
                        0.008617572941792261, 0.03547936261257615, 0.0, 0.0,
                        0.006189327591882107, 0.13606268110663236,
                        0.037222303163733886, 0.024007252359445064,
                        0.040891651692487006, 0.003232264365769295,
                        1.6188302332734367e-05, 0.004667627172605076,
                        0.00910861811255187, 9.173371321882807e-05,
                        0.0025415634662392956, 0.008946735089224526,
                        0.0023095311328034363, 0.04957397784361021,
                        0.09252154393235448, 0.03887890610245037, 0.0, 0.0,
                        0.0010792201555156243, 0.004867282901375466,
                        0.08281935473426902, 0.045640220376755754,
                        0.04933654940939677, 0.08426550974265995,
                        0.07772003949945769, 0.001327440791284218,
                        0.0014191745045030462, 0.0, 0.0, 0.009513325670870229,
                        0.010970272880816322, 0.009443176360761713
                    ], 185319, 116283720155.37769),
                    ([
                        2892.8730376693256, 119.94759695676377,
                        11.22516236778623, 189.0301354611245,
                        24.621525329374652, 2631.9842642419744,
                        219.94967526442753, 223.3794395991835,
                        135.71226572647987, 5409.1797365002785,
                        0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325,
                        0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0,
                        0.026498422712933754, 0.0, 0.04152904063833735,
                        0.005158656522545927, 0.0695490814622379, 0.0,
                        0.0634997216552236, 0.05418444980515866,
                        0.010391538318797551, 0.0002969010948227871, 0.0, 0.0,
                        0.0, 0.3677862312117276, 0.07596956763778066, 0.0,
                        0.01109667841900167, 0.005641120801632956, 0.0,
                        0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586,
                        0.018444980515865652, 0.010354425681944703
                    ], 26945, 46932273891.61873),
                    ([
                        3022.020861415003, 137.8546989122598, 13.3449108178427,
                        282.99227296949937, 45.23691263596753,
                        1606.0215197015768, 216.64941537882825,
                        222.64791856054669, 137.40339644525253,
                        2529.4366555907336, 0.4113429046111407,
                        0.08617284724616782, 0.5024842481426914, 0.0, 0.0,
                        0.0052506191028494405, 0.0, 0.014176671577693489, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.018949249239835743,
                        0.029850161436945546, 0.05403435628977148,
                        0.020892761982382997, 0.0, 0.0, 0.0018494718033917432,
                        0.011731607159650168, 0.005979436381304661,
                        0.0047098837027052445, 0.013714303626845553,
                        0.0007601642581737249, 0.047788470580859534,
                        0.10631328171530674, 0.04641704021817498,
                        0.0036519231372057308, 0.011872668568383437, 0.0,
                        0.00034481677690354536, 0.17267483777937995,
                        0.044473527475627724, 0.05637754302372967,
                        0.1292435973793925, 0.11970627880003762,
                        0.0013871038525438075, 0.004858781856368139, 0.0, 0.0,
                        0.03151155136202627, 0.028988119494686687,
                        0.012491771417823892
                    ], 127604, 95229063588.02844),
                    ([
                        3051.365089986695, 168.1268450579292,
                        14.114846831985933, 287.6101588092033,
                        50.702549817536706, 2835.266162979793,
                        209.89460702308608, 226.92302305495684,
                        148.84282479633362, 1461.8985753079312,
                        0.3284728328107128, 0.0006069141527711857,
                        0.670920253036516, 0.0, 0.0, 0.0054700083256172235,
                        0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.03886584862938554, 0.013250959002170886,
                        0.04277966681969203, 0.05480901656564399, 0.0, 0.0,
                        0.0010426473906581905, 0.0018440853103432178, 0.0,
                        0.0035014278044491476, 0.011671426014830491,
                        0.002435437561761296, 0.044405885511091744,
                        0.10662236712081483, 0.042756323967662366, 0.0,
                        0.007384122192049426, 0.006263665294625696, 0.0,
                        0.14390868276285998, 0.022152366576148275,
                        0.07071327974851968, 0.14799368186805065,
                        0.1011367968938445, 0.009111493242244337,
                        0.006427065258833325, 0.0009259331305098857,
                        0.002318723301612991, 0.03055579330682623,
                        0.041044514818820564, 0.024074261393257027
                    ], 128519, 106432862495.53804),
                    ([
                        3052.088693852026, 149.15056174929376,
                        11.549996765359152, 328.4748452763461,
                        44.2420589567205, 4786.68757682272, 215.8348392383499,
                        226.91413106764713, 143.9780260065124,
                        4192.589071226791, 0.8949819938326181, 0.0,
                        0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.0022642485929312314, 0.002415198499126647, 0.0,
                        0.00012938563388178466, 0.0, 0.1351648588618377, 0.0,
                        0.0, 0.0, 0.014836219351777974, 0.0, 0.0,
                        0.010674314795247235, 0.03553792077286352, 0.0,
                        0.039290104155435275, 0.09289888512712138,
                        0.03864317598602636, 0.0, 0.0, 0.0, 0.0,
                        0.4371509283419232, 0.08636491061609126,
                        0.0003665926293317232, 0.002717098311517478,
                        0.017100467944709204, 0.0, 0.0028249196730856323, 0.0,
                        0.0, 0.03226015138119164, 0.017316110667845514,
                        0.03204450865805533
                    ], 46373, 77991941653.19676),
                    ([
                        3119.4885286481917, 165.13178470083923,
                        11.672206122079334, 271.2690333876713,
                        39.407851838435064, 4959.81440560285,
                        212.5861709835175, 227.95909557447322,
                        148.6725381875264, 1613.4457676749382,
                        0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323,
                        0.0, 0.0, 0.0, 0.008346917828895732,
                        0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865,
                        0.0, 0.0, 0.02815009358208054, 0.012512829801364487,
                        0.0, 0.13355068526233171, 0.11424560767976816,
                        0.008799734347642335, 0.0, 0.0018867354947775161,
                        0.0012226046006158305, 0.0, 0.44056028497252914,
                        0.10774014369377528, 0.0033810300066413087,
                        0.014580691903640641, 0.02313892410795146,
                        0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0,
                        0.06503954597597053, 0.022625732053371973,
                        0.008256354525146411
                    ], 66252, 74666940350.2879),
                ]

                ### print h2o.dump_json(kmeans)
                predictKey = 'd'
                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeansResult, csvPathname, parseResult, predictKey,
                    **kwargs)
                # all are multipliers of expected tuple value
                allowedDelta = (0.01, 0.01, 0.01)
                # these clusters were sorted compared to the cluster order in training
                h2o_kmeans.showClusterDistribution(self,
                                                   tupleResultList,
                                                   expected,
                                                   trial=trial)
                # why is the expected # of rows not right in KMeans2. That means predictions are wrong
                h2o_kmeans.compareResultsToExpected(self,
                                                    tupleResultList,
                                                    expected,
                                                    allowedDelta,
                                                    allowError=False,
                                                    allowRowError=True,
                                                    trial=trial)

                print "Trial #", trial, "completed\n"
    def test_four_billion_rows(self):
        """Parse a 4-billion-row, 2-column CSV twice (once per destination
        key), sanity-check the parse via Inspect/Summary, then run KMeans
        and a binomial GLM on the parsed frame.

        NOTE(review): integration test against a live H2O cluster; it
        asserts only row/column counts, not model quality.
        """
        timeoutSecs = 1500

        importFolderPath = "billions"
        # Same file parsed under two different hex keys, so the loop body
        # runs twice end-to-end.
        csvFilenameList = [
            ("four_billion_rows.csv", "a.hex"),
            ("four_billion_rows.csv", "b.hex"),
        ]
        for (csvFilename, hex_key) in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            # byteSize is fetched for reference only; it is not asserted on.
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # The generator is expected to have produced exactly 2 columns
            # and 4 billion rows; a mismatch means the parse dropped data.
            self.assertEqual(
                2,
                numCols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, numCols))
            self.assertEqual(4 * 1000000000,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 20,
                'normalize': 0,
                'destination_key': 'junk.hex',
                # fixed seed for reproducible clustering across runs
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

            # Exec to make binomial########################
            # Rewrite column 2 in place as a 0/1 indicator (value == 0) so
            # the GLM below can use it as a binomial response.
            execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, 1 + 1, hex_key,
                                                  1 + 1, 0)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            # GLM*********************************
            print "\n" + csvFilename
            colX = 0
            kwargs = {
                'response': 'C2',
                'n_folds': 0,
                'cols': colX,
                'alpha': 0,
                'lambda': 0,
                'family': 'binomial',
                # 'link' can be family_default, identity, logit, log, inverse, tweedie
            }
            # one coefficient is checked a little more

            # L2
            # NOTE(review): alpha=0 with lambda=0 disables regularization
            # entirely, though the print below labels this run "L2" — confirm
            # the label is intentional.
            timeoutSecs = 900
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
# Example #56
# 0
def kmeans_doit(self,
                csvFilename,
                bucket,
                csvPathname,
                numRows,
                timeoutSecs=30):
    """Parse a CSV, run a k=1 KMeans on it, and check/compare the result.

    The single cluster center is compared against a hard-coded expected
    vector (values near 0 — presumably the 'hastie' dataset per the
    comment below; TODO confirm), then the model view is fetched and, on
    repeat calls, the new centers are compared to the first run's centers
    saved in self.clusters1.

    Args:
        csvFilename: file name; also used (with '.hex' appended) as the
            parse destination key.
        bucket: h2i bucket to import from.
        csvPathname: path of the file within the bucket.
        numRows: expected row count assigned to the single cluster.
        timeoutSecs: timeout for the KMeans run (parse uses its own 20s).
    """
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket,
                                   path=csvPathname,
                                   schema='put',
                                   hex_key=csvFilename + ".hex",
                                   timeoutSecs=20)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        'k': 1,
        'initialization': 'Furthest',
        'destination_key': 'KMeansModel.hex',
        'max_iter': 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    # Predict into key 'd' and gather per-cluster (center, rows, error) tuples.
    (centers,
     tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname,
                                                   parseResult, 'd', **kwargs)

    # With k=1 the single center should sit near the data mean, and all
    # numRows rows should land in that one cluster (error not checked: None).
    expected = [([
        -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
        0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
        0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
        0.0021324000161308796, 0.00154
    ], numRows, None)]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self,
                                        tupleResultList,
                                        expected,
                                        allowedDelta,
                                        trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    # inspect doesn't work
    # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'])
    # KMeansModel = inspect['KMeansModel']
    modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
    model = modelView['model']
    clusters = model['centers']
    within_cluster_variances = model['within_cluster_variances']
    total_within_SS = model['total_within_SS']
    print "within_cluster_variances:", within_cluster_variances
    print "total_within_SS:", total_within_SS

    # First call stashes the centers; later calls compare against them.
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
# Example #57
# 0
    def test_four_billion_rows(self):
        """Parse a 4-billion-row, 2-column CSV using the pre-beta (h2o-1)
        API, sanity-check the parse via Inspect/Summary, then run KMeans
        and a case-based binomial GLM on it.

        NOTE(review): with beta_features=False the Inspect result uses the
        old 'num_cols'/'num_rows' keys, and GLM takes 'y'/'case_mode'
        instead of 'response' — intentionally the legacy-API variant of
        this test.
        """
        h2o.beta_features = False
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
        ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=180)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            # forget about checking the bytesize
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            # computed for reference only; the value-size check is disabled below
            expectedRowSize = num_cols * 1  # plus output
            # expectedValueSize = expectedRowSize * num_rows

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # The generator is expected to have produced exactly 2 columns
            # and 4 billion rows; a mismatch means the parse dropped data.
            self.assertEqual(
                2,
                num_cols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, num_cols))
            self.assertEqual(4 * 1000000000,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'cols': 'C1, C2',
                'initialization': 'Furthest',
                'max_iter': 4,
                'normalize': 0,
                'destination_key': 'junk.hex',
                # fixed seed for reproducible clustering across runs
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            # Legacy-API binomial GLM: case_mode '=' with case 1 turns the
            # response into (C2 == 1) — presumably matching the generated
            # data's value range; confirm against the generator.
            kwargs = {
                'y': 'C2',
                'n_folds': 0,
                'family': 'binomial',
                'case_mode': '=',
                'case': 1
            }
            # L2
            # NOTE(review): alpha=0 with lambda=0 disables regularization
            # entirely, though the print below labels this run "L2".
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)