Example #1
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # reuse the same seed, to get deterministic results (otherwise sometimes fails)
        kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex', 'seed': 265211114317615310}

        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

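        # expected tuples mirror tupleResultList: (center, rows_per_cluster, sqr_error_per_cluster)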
        expected = [
            ([100, 100, 100], 1000000,   60028168),
            ([200, 200, 200], 2000000,  479913618),
            ([300, 300, 300], 3000000, 1619244994),
        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01) 
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
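Note: the sphere tests above call a write_syn_dataset helper that lives elsewhere in the test file and is not shown in these snippets. Purely for orientation, a minimal sketch of what such a generator could look like follows; the name write_syn_dataset_sketch, the jitter, and the n/2n/3n rows-per-sphere split are assumptions inferred from the expected tuples, not the actual H2O test code.

import random

def write_syn_dataset_sketch(csvPathname, n, seed):
    # hypothetical stand-in for the write_syn_dataset used by test_kmeans_sphere3 above
    r = random.Random(seed)
    with open(csvPathname, 'w') as f:
        for i, center in enumerate([100, 200, 300], start=1):
            # n, 2*n, 3*n rows per sphere is an assumption inferred from the expected row counts
            for _ in range(n * i):
                row = [center + r.uniform(-1, 1) for _ in range(3)]
                f.write(','.join('%f' % v for v in row) + '\n')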
Example #2
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'}
        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # cluster centers can return in any order
        centersSorted = sorted(centers, key=itemgetter(0))

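        # after sorting on the first coordinate, the centers should line up with the
        # generated spheres near (100,100,100), (200,200,200) and (300,300,300)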
        self.assertAlmostEqual(centersSorted[0][0],100,delta=.2)
        self.assertAlmostEqual(centersSorted[1][0],200,delta=.2)
        self.assertAlmostEqual(centersSorted[2][0],300,delta=.2)

        self.assertAlmostEqual(centersSorted[0][1],100,delta=.2)
        self.assertAlmostEqual(centersSorted[1][1],200,delta=.2)
        self.assertAlmostEqual(centersSorted[2][1],300,delta=.2)

        self.assertAlmostEqual(centersSorted[0][2],100,delta=.2)
        self.assertAlmostEqual(centersSorted[1][2],200,delta=.2)
        self.assertAlmostEqual(centersSorted[2][2],300,delta=.2)

        model_key = kmeans['destination_key']
        show_results(csvPathname, parseKey, model_key, centers, 'd')
Example #3
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_spheres3_" + str(SEED) + ".csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # reuse the same seed, to get deterministic results (otherwise sometimes fails)
        kwargs = {"k": 3, "epsilon": 1e-6, "cols": None, "destination_key": "spheres3.hex", "seed": 265211114317615310}

        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100
        )

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, "d", **kwargs)

        expected = [
            ([100, 100, 100], 1000000, 60028168),
            ([200, 200, 200], 2000000, 479913618),
            ([300, 300, 300], 3000000, 1619244994),
        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)
Example #4
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'k': 2, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example #5
    def test_KMeans_constant_col(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseResult['destination_key']

            kwargs = {'k': 2, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # check center list (first center) has same number of cols as source data
            self.assertEqual(colCount, len(centers[0]),
                "kmeans first center doesn't have same # of values as dataset row %s %s" % (colCount, len(centers[0])))
Example #6
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
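Note: the many-cols and constant-col tests above use a write_syn_dataset(csvPathname, rowCount, colCount, SEED) helper that is defined outside these snippets. A hedged sketch under that assumed signature is below; the value range and CSV layout are guesses, and the constant_first_col flag only mimics the "first column constant = 0" variant described in test_KMeans_constant_col.

import random

def write_syn_dataset_sketch(csvPathname, rowCount, colCount, seed, constant_first_col=False):
    # hypothetical stand-in: rowCount rows of colCount comma-separated random values;
    # constant_first_col forces the first column to 0, as in the constant-column test above
    r = random.Random(seed)
    with open(csvPathname, 'w') as f:
        for _ in range(rowCount):
            row = [r.uniform(0, 1) for _ in range(colCount)]
            if constant_first_col:
                row[0] = 0
            f.write(','.join(str(v) for v in row) + '\n')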
Example #7
    def test_B_kmeans_benign(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = "benign.csv"
        key2 = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right?
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #8
    def test_C_kmeans_prostate(self):

        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = "prostate.csv"
        key2 = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]
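        # with 'cols': 2 in kwargs below, kmeans runs on a single column, so each expected center has just one value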

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #9
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could leave cols unspecified, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1, 
        'epsilon': 1e-6,
        'cols': cols, 
        'destination_key': 'KMeansModel.hex'
    }
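    # with k=1 the single cluster center is just the per-column mean of the data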
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)


    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)
    
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Example #10
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could leave cols unspecified, but this is more fun
    cols = ",".join(map(str, range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    ### print h2o.dump_json(inspect)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Example #11
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, "cA", 5),
            (100, 10, "cB", 5),
            (100, 9, "cC", 5),
            (100, 8, "cD", 5),
            (100, 7, "cE", 5),
            (100, 6, "cF", 5),
            (100, 5, "cG", 5),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = "syn_" + str(SEED) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey["destination_key"]

            kwargs = {"k": 2, "epsilon": 1e-6, "cols": None, "destination_key": "benign_k.hex"}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, "d", **kwargs)
Example #12
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        kwargs = {'k': 1, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
Example #13
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'}
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #14
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        for i in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example #15
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        for i in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example #16
    def test_B_kmeans_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        expected = [
            ([
                24.538961038961038, 2.772727272727273, 46.89032467532467,
                0.1266233766233766, 12.012142857142857, 1.0105194805194804,
                1.5222727272727272, 22.26039690646432, 12.582467532467534,
                0.5275062016635049, 2.9477601050634767, 162.52136363636365,
                41.94558441558441, 1.661883116883117
            ], 77, 46889.32010560476),
            ([
                25.587719298245613, 2.2719298245614037, 45.64035087719298,
                0.35964912280701755, 13.026315789473685, 1.4298245614035088,
                1.3070175438596492, 24.393307707470925, 13.333333333333334,
                0.5244431302976542, 2.7326039818647745, 122.46491228070175,
                40.973684210526315, 1.6754385964912282
            ], 114, 64011.20272144667),
            ([
                30.833333333333332, 2.9166666666666665, 46.833333333333336,
                0.0, 13.083333333333334, 1.4166666666666667,
                1.5833333333333333, 24.298220973782772, 11.666666666666666,
                0.37640449438202245, 3.404494382022472, 224.91666666666666,
                39.75, 1.4166666666666667
            ], 12, 13000.485226507595),
        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'cols': None,
                'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310
            }

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
Example #17
    def test_KMeans_twit(self):
        csvFilename = "Twitter2DB.txt"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/' + csvFilename)

        # h2b.browseTheCloud()
        # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        # should check the means?
        # FIX! have to fix these to right answers
        expected = [
            # expected centers are from R. rest is just from h2o
            ([310527.2, 13433.89], 11340, None),
            ([5647967.1, 40487.76], 550, None),
            ([21765291.7, 93129.26], 14, None),
        ]
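        # the None third fields presumably mean the per-cluster error is not compared for this dataset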
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {
                'k': 3,
                'max_iter': 50,
                'epsilon': 1e-4,
                'normalize': 0,
                'cols': '0,1',
                'initialization': 'Furthest',
                # 'initialization': 'PlusPlus',
                'destination_key': 'kmeans_dest_key',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310
            }

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            if 1 == 0:
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeans")
                time.sleep(3600)

            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
Example #18
    def test_KMeans_winesPCA(self):
        csvPathname = h2o.find_file('smalldata/winesPCA.csv')
        start = time.time()
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            #appears not to take 'cols'?
            'cols': None,
            'initialization': 'Furthest',
            'k': 3,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails)
            'seed': 265211114317615310,
        }

        timeoutSecs = 480

        # try the same thing 10 times
        for trial in range(10):
            start = time.time()

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            expected = [
                ([-2.25977535371875,
                  -0.8631572635625001], 64, 83.77800617624794),
                ([0.16232721958461543,
                  1.7626161107230771], 65, 111.64440134649745),
                ([2.7362112930204074,
                  -1.2107751495102044], 49, 62.6290553489474),
            ]
            # multipliers on the expected values for allowed
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList,
                                                expected, allowedDelta, trial)
Example #19
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could leave cols unspecified, but this is more fun
    cols = ",".join(map(str, range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex',
        # reuse the same seed, to get deterministic results (otherwise sometimes fails)
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers,
     tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname,
                                                   parseKey, 'd', **kwargs)

    expected = [([
        -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
        0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
        0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
        0.0021324000161308796, 0.00154
    ], num_rows, None)]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self,
                                        tupleResultList,
                                        expected,
                                        allowedDelta,
                                        trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Example #20
    def test_KMeans_winesPCA(self):
        if localhost:
            csvFilenameList = [
                # with winesPCA2.csv specify cols = "1,2"
                ('winesPCA.csv', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('winesPCA.csv', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = os.path.abspath(h2o.find_file('smalldata'))
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, 'winesPCA.csv', importFolderPath, 
                timeoutSecs=2000, key2=key2) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                # appears not to take 'cols'?
                'cols': None,
                'epsilon': 1e-6,
                'k': 3
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            print "Expected centers: [-2.276318, -0.965151], with 59 rows."
            print "                  [0.0388763, 1.63886039], with 71 rows."
            print "                  [2.740469, -1.237816], with 48 rows."
            model_key = kmeans['destination_key']
            kmeansScoreResult = h2o.nodes[0].kmeans_score(
                key=parseKey['destination_key'], model_key=model_key)
            score = kmeansScoreResult['score']
Example #21
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 800),
            ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'k': 1,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,
                                             key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #22
    def test_C_kmeans_prostate(self):

        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = "prostate.csv"
        key2 = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             key2=key2,
                                             header=1,
                                             timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117),
            ([63.93984962406015], 133, 611.5187969924812),
            ([71.55307262569832], 179, 1474.2458100558654),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'cols': 2,
                'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310
            }

            # for fvec only?
            kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
Example #23
    def test_GLM_covtype20x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype20x.data', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2, noise=('JStack', None))
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                'cols': None,
                'epsilon': 1e-4,
                'k': 2
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
Example #24
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            kwargs = {'k': CLUSTERS, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'syn_spheres100.hex'}
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i,center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)
                self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" x not correct.")
                self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" y not correct.")
                self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" z not correct.")

            print "Trial #", trial, "completed"
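Note: test_kmeans_sphere5 depends on a write_spheres_dataset helper that returns the generated centers so they can be compared against the sorted H2O clusters. The real generator is not shown here; the sketch below is only a guess at its behavior, and the per-axis spacing, jitter, and seed handling are assumptions.

import random

def write_spheres_dataset_sketch(csvPathname, n_clusters, pts_per_sphere, seed=None):
    # hypothetical stand-in for write_spheres_dataset: writes n_clusters spheres and
    # returns their centers, ordered by the first coordinate so they can be compared
    # against clustersSorted in test_kmeans_sphere5 above
    r = random.Random(seed)
    centersList = []
    with open(csvPathname, 'w') as f:
        for i in range(n_clusters):
            center = [(i + 1) * 100.0] * 3  # spacing of 100 per axis is an assumption
            centersList.append(center)
            for _ in range(pts_per_sphere):
                row = [c + r.uniform(-1, 1) for c in center]
                f.write(','.join('%f' % v for v in row) + '\n')
    return centersList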
Example #25
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        for i in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            model_key = kmeans['destination_key']
            kmeansResult = h2o_cmd.runInspect(key=model_key)
            centers = kmeansResult['KMeansModel']['clusters']
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            show_results(csvPathname, parseKey, model_key, centers, 'd')
Example #26
    def test_KMeans_constant_col(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            kwargs = {
                'k': 2,
                'initialization': 'Furthest',
                'cols': None,
                'destination_key': 'benign_k.hex'
            }
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            # check center list (first center) has same number of cols as source data
            self.assertEqual(
                colCount, len(centers[0]),
                "kmeans first center doesn't have same # of values as dataset row %s %s"
                % (colCount, len(centers[0])))
Example #27
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        kwargs = {
            'k': 3,
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'spheres3.hex'
        }
        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)

        kmeansResult = h2o_cmd.runInspect(key='spheres3.hex')

        ### print h2o.dump_json(kmeans)
        print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        clusters = kmeansResult['KMeansModel']['clusters']

        # cluster centers can return in any order
        clustersSorted = sorted(clusters, key=itemgetter(0))

        self.assertAlmostEqual(clustersSorted[0][0], 100, delta=.2)
        self.assertAlmostEqual(clustersSorted[1][0], 200, delta=.2)
        self.assertAlmostEqual(clustersSorted[2][0], 300, delta=.2)

        self.assertAlmostEqual(clustersSorted[0][1], 100, delta=.2)
        self.assertAlmostEqual(clustersSorted[1][1], 200, delta=.2)
        self.assertAlmostEqual(clustersSorted[2][1], 300, delta=.2)

        self.assertAlmostEqual(clustersSorted[0][2], 100, delta=.2)
        self.assertAlmostEqual(clustersSorted[1][2], 200, delta=.2)
        self.assertAlmostEqual(clustersSorted[2][2], 300, delta=.2)
Example #28
    def test_KMeans_winesPCA(self):
        csvPathname = h2o.find_file('smalldata/winesPCA.csv')
        start = time.time()
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            #appears not to take 'cols'?
            'cols': None,
            'initialization': 'Furthest',
            'k': 3,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails)
            'seed': 265211114317615310,
        }

        timeoutSecs = 480

        # try the same thing 10 times
        for trial in range (10):
            start = time.time()

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            expected = [
                ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794) ,
                ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745) ,
                ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474) ,
            ]
            # multipliers on the expected values for allowed
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
Example #29
    def test_KMeans_covtype20x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype20x.data', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                'cols': None,
                'epsilon': 1e-4,
                'k': 2, 
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example #30
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 800),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {'k': 1, 'destination_key': csvFilename + "_" + str(trial) + '.hex'}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #31
    def test_KMeans_twit(self):
        csvFilename = "Twitter2DB.txt"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/' + csvFilename)

        # h2b.browseTheCloud()
        # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        # should check the means?
        # FIX! have to fix these to right answers
        expected = [
                # expected centers are from R. rest is just from h2o
                ([310527.2, 13433.89], 11340, None),
                ([5647967.1, 40487.76], 550, None),
                ([21765291.7, 93129.26], 14,  None),
            ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {
                'k': 3, 
                'max_iter': 50,
                'epsilon': 1e-4,
                'normalize': 0,
                'cols': '0,1',
                'initialization': 'Furthest', 
                # 'initialization': 'PlusPlus',
                'destination_key': 'kmeans_dest_key',
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                'seed': 265211114317615310
            }

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            if 1==0:
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply")
                h2b.browseJsonHistoryAsUrlLastMatch("KMeans")
                time.sleep(3600)

            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #32
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could leave cols unspecified, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1, 
        'epsilon': 1e-6,
        'cols': cols, 
        'destination_key': 'KMeansModel.hex',
        # reuse the same seed, to get deterministic results (otherwise sometimes fails)
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

    expected = [
        ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], num_rows, None)
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)



    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)
    
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
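
With k=1 there is nothing to cluster: the single center K-Means converges to is simply the per-column mean of the data, so the expected tuple above is effectively the vector of column means for the file. A quick way to cross-check that from the raw CSV (a standalone sketch, assuming a numeric, comma-separated file with no header; not part of the original test):

    import numpy as np

    def column_means(csvPathname):
        # with k=1, the K-Means center is the per-column mean, so this
        # recomputes the expected center directly from the CSV
        data = np.genfromtxt(csvPathname, delimiter=',')
        return data.mean(axis=0)
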
Example #33
0
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            model_key = kmeans['destination_key']
            kmeansResult = h2o_cmd.runInspect(key=model_key)

            ## h2o.nodes[0].kmeans_apply(data_key=parseKey['destination_key'], model_key=model_key, destination_key='a')
            # this is failing for some reason
            ## h2o.nodes[0].kmeans_score(key=parseKey['destination_key'], model_key=model_key)

            clusters = kmeansResult['KMeansModel']['clusters']
            for i,c in enumerate(clusters):
                print "clusters["+str(i)+"]: ", clusters[i]

            ## print h2o.dump_json(kmeans)
            ## print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
Example #34
0
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        kwargs = {
            'k': 1,
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'prostate_k.hex'
        }
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                       timeoutSecs=5,
                                       **kwargs)
        kmeansResult = h2o_cmd.runInspect(key='prostate_k.hex')
        print h2o.dump_json(kmeans)
        print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
Example #35
0
    def test_B_kmeans_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='put', hex_key=csvFilename+".hex")

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310}

            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #36
0
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='put', hex_key=csvFilename+".hex")

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {'k': 3, 'initialization': 'Furthest', 'cols': 2, 'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310}

            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)

            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'allstate'
        csvFilename = "train_set.csv"
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 600
        trialMax = 3
        for trial in range(trialMax):
            trialStart = time.time()
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print 'h2o reported parse time:', parseResult['response']['time']
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                'cols': None,
                'initialization': 'Furthest',
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example #38
0
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")

        for trial in range(10):
            # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
            kwargs = {
                'k': 3, 
                'initialization': 'Furthest',
                'epsilon': 1e-6, 
                'cols': None, 
                'destination_key': 'spheres3.hex', 
                # 'seed': 265211114317615310,
                'seed': 0,
                }

            timeoutSecs = 90
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            expected = [
                ([100, 100, 100], 1000000,   60028168),
                ([200, 200, 200], 2000000,  479913618),
                ([300, 300, 300], 3000000, 1619244994),
            ]
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #39
0
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117),
            ([63.93984962406015], 133, 611.5187969924812),
            ([71.55307262569832], 179, 1474.2458100558654),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'cols': 2,
                'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310
            }

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
    def test_four_billion_rows(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/billions"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 1500

        csvFilenameAll = [
            "four_billion_rows.csv",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()

            # Parse*********************************
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=timeoutSecs,
                                                 pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            value_size_bytes = inspect['value_size_bytes']
            row_size = inspect['row_size']
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1  # plus output
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))
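            # Sanity arithmetic for the two asserts above (a sketch of the reasoning,
            # not part of the original test): with 2 single-byte columns and
            # 4 billion rows, expectedRowSize = 2 bytes and
            # expectedValueSize = 2 * 4e9 = 8e9 bytes (~8 GB).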

            summaryResult = h2o_cmd.runSummary(key=parseKey['destination_key'],
                                               timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                num_cols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, num_cols))
            self.assertEqual(4 * 1000000000,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'max_iter': 20,
                'cols': None,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'x': 0,
                'y': 1,
                'n_folds': 0,
                'case_mode': '=',
                'case': 1
            }
            # one coefficient is checked a little more
            colX = 0

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        # csvFilename = "covtype20x.data"
        # csvPathname = csvFilename
        csvFilename = "train_set.csv"
        csvPathname = "allstate/" + csvFilename
        # https://s3.amazonaws.com/home-0xdiag-datasets/allstate/train_set.csv
        URI = "s3n://home-0xdiag-datasets/"
        s3nKey = URI + csvPathname

        trialMax = 3

        for trial in range(trialMax):
            trialStart = time.time()
            # since we delete the key, we have to re-import every iteration
            # s3n URI thru HDFS is not typical.
            importHDFSResult = h2o.nodes[0].import_hdfs(URI)
            s3nFullList = importHDFSResult["succeeded"]
            ### print "s3nFullList:", h2o.dump_json(s3nFullList)
            self.assertGreater(len(s3nFullList), 8, "Didn't see more than 8 files in s3n?")
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            for s in storeView["keys"]:
                print "\nkey:", s["key"]
                if "rows" in s:
                    print "rows:", s["rows"], "value_size_bytes:", s["value_size_bytes"]

            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, "thru HDFS"
            # ec2 is about 400 secs on four m2.4xlarge nodes
            # should be less on more nodes?
            timeoutSecs = 600
            start = time.time()
            parseKey = h2o.nodes[0].parse(
                s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=("JStack", None)
            )
            elapsed = time.time() - start
            print s3nKey, "h2o reported parse time:", parseKey["response"]["time"]
            print "parse end on ", s3nKey, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "parse result:", parseKey["destination_key"]

            kwargs = {"cols": None, "epsilon": 1e-6, "k": 12}

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(
                parseKey=parseKey, timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans["destination_key"])
            print h2o.dump_json(inspect)

            print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", "Otherwise it would just parse the cached key."
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            # h2o removes key after parse now
            ### print "Removing", s3nKey
            ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)
            ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.",
Example #42
0
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, key2=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 10 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'initialization': 'Furthest', 
                'cols': cols,
                'destination_key': 'syn_spheres100.hex'
            }
            timeoutSecs = 100
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']

            # Because of the way the centers are created above, sorting on the sum of xyz
            # should give the same order they were created in.
            # To be safe, sort the generated centers the same way too.
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i,center in enumerate(centersSorted):
                # Comparing generated vs. actual centers is kind of a hamming-distance problem.
                # Assuming the difference between adjacent sums of all center values
                # is greater than 2x the sum of the max allowed variance on each value,
                # the sums will be unique and non-overlapping within the allowed variance.
                # So sorting the centers, keyed on the sum of all values in a center,
                # creates an ordering that can be compared.
                # Sort generated and actual centers separately.
                # The adjacent-center distance check is done during generation above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)

                for i, v in enumerate(a):
                    emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct."
                    self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        csvFilename = "CAT*"
        URI = "s3n://home-0xdiag-datasets/cats"
        s3nKey = URI + "/" + csvFilename

        trialMax = 1

        for trial in range(trialMax):
            trialStart = time.time()
            # since we delete the key, we have to re-import every iteration
            # s3n URI thru HDFS is not typical.
            importHDFSResult = h2o.nodes[0].import_hdfs(URI)
            s3nFullList = importHDFSResult['succeeded']
            ### print "s3nFullList:", h2o.dump_json(s3nFullList)
            self.assertGreater(len(s3nFullList),1,"Didn't see more than 1 files in s3n?")
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            for s in storeView['keys']:
                print "\nkey:", s['key']
                if 'rows' in s:
                    print "rows:", s['rows'], "value_size_bytes:", s['value_size_bytes']

            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            # ec2 is about 400 secs on four m2.4xlarge nodes
            # should be less on more nodes?
            timeoutSecs = 600
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey, key2,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=('JStack', None))
            elapsed = time.time() - start
            print s3nKey, 'h2o reported parse time:', parseKey['response']['time']
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "parse result:", parseKey['destination_key']

            kwargs = {
                'cols': None,
                'epsilon': 1e-6,
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                  "Otherwise it would just parse the cached key."
            storeView = h2o.nodes[0].store_view()
            # pattern matching problem
            # h2o removes key after parse now
            ### print "Removing", s3nKey
            ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example #44
0
    def test_KMeans_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "/datasets/kmeans_big"
            csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([
                0.0, -113.00566692375459, -89.99595447985321,
                -455.9970643424373, 4732.0, 49791778.0, 36800.0
            ], 248846122, 1308149283316.2988),
            ([
                0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                25654042.00592703, 28304.0
            ], 276924291, 1800760152555.98),
            ([
                0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                31319.99486705394
            ], 235089554, 375419158808.3253),
            ([
                0.0, 10.0, -72.00113070337981, -171.0198611715457,
                4430.00952228909, 37007399.0, 29894.0
            ], 166180630, 525423632323.6474),
            ([
                0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                22865824.99639042, 5335.0
            ], 167234179, 1845362026223.1094),
            ([
                0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                -47537.998050740985
            ], 195420925, 197941282992.43475),
            ([
                0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289,
                1928.0, 39967190.0, 27202.0
            ], 214401768, 11868360232.658035),
            ([
                0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                30712.99115201907
            ], 258853406, 598863991074.3276),
            ([
                0.0, 21.0, 114.01584574295777, 242.99690338815898,
                1674.0029079209912, 33089556.0, 36415.0
            ], 190979054, 1505088759456.314),
            ([
                0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                -48473733.04122273, 47343.0
            ], 87794427, 1124697008162.3955),
            ([
                0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                16716.003410920028
            ], 78226988, 1151439441529.0215),
            ([
                0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                -14930.007919032574
            ], 167273589, 693036940951.0249),
            ([
                0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                11767.998552236539
            ], 148426180, 35942838893.32379),
            ([
                0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                -23336.998167498707
            ], 157533313, 88431531357.62982),
            ([
                0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008,
                2320.0, 46602185.0, 11212.0
            ], 118361306, 1111537045743.7646),
        ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            if FROM_HDFS:
                importFolderResult = h2i.setupImportHdfs(
                    None, importFolderPath)
            else:
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)

            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            key2 = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseKey = h2i.parseImportHdfsFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutsecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)
            else:
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutsecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'cols': None,
                'destination_key': 'junk.hex',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           benchmarkLogging=benchmarkLogging,
                                           **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=True,
                                                trial=trial)
    def test_B_kmeans_benign(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = "benign.csv"
        key2 = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right?
        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             key2=key2,
                                             header=1,
                                             timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([
                24.538961038961038, 2.772727272727273, 46.89032467532467,
                0.1266233766233766, 12.012142857142857, 1.0105194805194804,
                1.5222727272727272, 22.26039690646432, 12.582467532467534,
                0.5275062016635049, 2.9477601050634767, 162.52136363636365,
                41.94558441558441, 1.661883116883117
            ], 77, 46889.32010560476),
            ([
                25.587719298245613, 2.2719298245614037, 45.64035087719298,
                0.35964912280701755, 13.026315789473685, 1.4298245614035088,
                1.3070175438596492, 24.393307707470925, 13.333333333333334,
                0.5244431302976542, 2.7326039818647745, 122.46491228070175,
                40.973684210526315, 1.6754385964912282
            ], 114, 64011.20272144667),
            ([
                30.833333333333332, 2.9166666666666665, 46.833333333333336,
                0.0, 13.083333333333334, 1.4166666666666667,
                1.5833333333333333, 24.298220973782772, 11.666666666666666,
                0.37640449438202245, 3.404494382022472, 224.91666666666666,
                39.75, 1.4166666666666667
            ], 12, 13000.485226507595),
        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            kwargs = {
                'k': 3,
                'epsilon': 1e-6,
                'cols': None,
                'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310
            }

            # for fvec only?
            kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=5,
                                           **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
Example #46
0
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            kwargs = {
                'k': CLUSTERS,
                'epsilon': 1e-6,
                'cols': None,
                'destination_key': 'syn_spheres100.hex'
            }
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i, center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b  # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)
                self.assertAlmostEqual(a[0],
                                       b[0],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " x not correct.")
                self.assertAlmostEqual(a[1],
                                       b[1],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " y not correct.")
                self.assertAlmostEqual(a[2],
                                       b[2],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " z not correct.")

            print "Trial #", trial, "completed"
Example #47
0
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, key2=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 10 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'epsilon': 1e-6, 
                'cols': cols,
                'destination_key': 'syn_spheres100.hex'
            }
            timeoutSecs = 100
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']

            # Because of the way the centers are created above, sorting on the sum of xyz
            # should give the same order they were created in.
            # To be safe, sort the generated centers the same way too.
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i,center in enumerate(centersSorted):
                # Comparing generated vs. actual centers is kind of a hamming-distance problem.
                # Assuming the difference between adjacent sums of all center values
                # is greater than 2x the sum of the max allowed variance on each value,
                # the sums will be unique and non-overlapping within the allowed variance.
                # So sorting the centers, keyed on the sum of all values in a center,
                # creates an ordering that can be compared.
                # Sort generated and actual centers separately.
                # The adjacent-center distance check is done during generation above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)

                for i, v in enumerate(a):
                    emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct."
                    self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
Example #48
0
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
            ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
            ]

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {
                    'k': k,
                    'epsilon': epsilon,
                    'cols': cols,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,
                                             key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeans_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "/datasets/kmeans_big"
            csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            if FROM_HDFS:
                importFolderResult = h2i.setupImportHdfs(None, importFolderPath)
            else:
                importFolderResult = h2i.setupImportFolder(None, importFolderPath)

            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            key2 = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseKey = h2i.parseImportHdfsFile(None, csvFilename, importFolderPath, key2=key2,
                    timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
                    timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'initialization': 'Furthest',
                'epsilon': 1e-6, 
                'cols': None, 
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                }

            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
Example #50
0
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        # csvFilename = "covtype20x.data"
        # csvPathname = csvFilename
        csvFilename = "CAT*"
        csvPathname = "cats/" + csvFilename
        # https://s3.amazonaws.com/home-0xdiag-datasets/allstate/train_set.csv
        URI = "s3n://home-0xdiag-datasets/"
        s3nKey = URI + csvPathname

        trialMax = 1

        for trial in range(trialMax):
            trialStart = time.time()
            # since we delete the key, we have to re-import every iteration
            # s3n URI thru HDFS is not typical.
            importHDFSResult = h2o.nodes[0].import_hdfs(URI)
            s3nFullList = importHDFSResult['succeeded']
            ### print "s3nFullList:", h2o.dump_json(s3nFullList)
            self.assertGreater(len(s3nFullList), 8,
                               "Didn't see more than 8 files in s3n?")
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            for s in storeView['keys']:
                print "\nkey:", s['key']
                if 'rows' in s:
                    print "rows:", s['rows'], "value_size_bytes:", s[
                        'value_size_bytes']

            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            # ec2 is about 400 secs on four m2.4xlarge nodes
            # should be less on more nodes?
            timeoutSecs = 600
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey,
                                          key2,
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=60,
                                          noise=('JStack', None))
            elapsed = time.time() - start
            print s3nKey, 'h2o reported parse time:', parseKey['response'][
                'time']
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "parse result:", parseKey['destination_key']

            kwargs = {'cols': None, 'epsilon': 1e-6, 'k': 12}

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                  "Otherwise it would just parse the cached key."
            storeView = h2o.nodes[0].store_view()
            # pattern matching problem
            # h2o removes key after parse now
            ### print "Removing", s3nKey
            ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example #51
0
    def test_four_billion_rows(self):
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
            ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            value_size_bytes = inspect['value_size_bytes']
            row_size = inspect['row_size']
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1 # 1 byte per col (num_cols already includes the output col)
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))
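            # Worked example of the check above (hedged, assuming the 1-byte-per-column
            # storage implied by expectedRowSize): with num_cols == 2 and num_rows ==
            # 4,000,000,000, row_size should be 2 bytes and value_size_bytes should be
            # 2 * 4,000,000,000 = 8,000,000,000 bytes (~8 GB).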

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(2, num_cols,
                msg="generated %s cols (including output).  parsed to %s cols" % (2, num_cols))
            self.assertEqual(4*1000000000, num_rows,
                msg="generated %s rows, parsed to %s rows" % (4*1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'max_iter': 20,
                'cols': None,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
                }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
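            # (Hedged addition: mirror the elapsed-time report used after the other runs,
            # since start was captured above but never reported here.)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)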

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1}
            # one coefficient is checked a little more
            colX = 0

            # L2 
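            # (Hedged note: in the usual elastic-net convention, alpha=0 selects the pure
            # L2/ridge penalty; with lambda=0 the penalty strength is zero, so this run is
            # effectively unregularized.)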
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
Example #52
0
    def test_parse_bounds_libsvm(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/libsvm"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("mnist_train.svm", "cM", 30, 1),
            # FIX! fails KMeansScore
            ("tmc2007_train.svm",  "cJ", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            ("colon-cancer.svm",   "cA", 30, 1),
            ("connect4.svm",       "cB", 30, 1),
            ("duke.svm",           "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm",  "cF", 30, 1),
            ("mushrooms.svm",      "cG", 30, 1),
            ("news20.svm",         "cH", 30, 1),

            ("syn_6_1000_10.svm",  "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
            # normal csv
        ]
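        # Each tuple above is (csvFilename, key2, timeoutSecs, resultMult), matching the
        # unpacking in the loop below.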

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvPathname, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360)
            print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            # KMEANS******************************************
            for trial in range(2):
                kwargs = {
                    'k': 3, 
                    'epsilon': 1e-6, 
                    # 'cols': 2, 
                    # 'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                    'seed': 265211114317615310
                }

                # fails if this is put in kwargs, i.e. source == dest:
                # 'destination_key': parseKey['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)