Example #1
0
    def test_KMeans_covtype_cols_fvec(self):
        h2o.beta_features = True
        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': range(11, numCols),
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                for trial2 in range(3):
                    timeoutSecs = 600
                    start = time.time()
                    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                    elapsed = time.time() - start
                    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                    # this does an inspect of the model and prints the clusters
                    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1, 
        'epsilon': 1e-6,
        'cols': cols, 
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    ### print h2o.dump_json(inspect)


    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)
    
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Example #3
0
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str, range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    ### print h2o.dump_json(inspect)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Example #4
0
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10

            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE,
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols':
                    None,  #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #5
0
    def test_KMeans2_winesPCA(self):
        h2o.beta_features = True
        csvPathname = 'winesPCA.csv'
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        kwargs = {
            'initialization': 'Furthest',
            # 'initialization': '',
            # 'initialization': 'PlusPlus',
            'max_iter': 50,
            'k': 3,
            'seed': '265211114317615310',
        }

        timeoutSecs = 480

        # try the same thing 5 times
        for trial in range (10):
            start = time.time()

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            if OLD_KMEANS:
                expected = [
                    ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794),
                    ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745),
                    ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
                    ]
            else:
                # error:  258.051462872
                expected = [
                    ([-2.23406681758209, -0.7729819755373136], 67, 96.85372611195429),
                    ([0.25174392601612905, 1.792222172419355], 62, 99.21823733913352),
                    ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
                        ]

            # multipliers on the expected values for allowed
            # within 2% of best with random seeds?
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            # pass SEED so it's repeatable
            kwargs = {
                'k': CLUSTERS, 
                'max_iter': 10,
                'initialization': 'Furthest', 
                'cols': None, 
                'destination_key': 'syn_spheres100.hex', 
                'seed': SEED
            }
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i,center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)
                self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" x not correct.")
                self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" y not correct.")
                self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" z not correct.")

            print "Trial #", trial, "completed"
Example #7
0
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        kwargs = {'k': 1, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'}
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #9
0
    def test_KMeans_winesPCA(self):
        csvPathname = 'winesPCA.csv'
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put',
                                       timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            #appears not to take 'cols'?
            'cols': None,
            'initialization': 'Furthest',
            'k': 3,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }

        timeoutSecs = 480

        # try the same thing 5 times
        for trial in range(10):
            start = time.time()

            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            expected = [
                ([-2.25977535371875,
                  -0.8631572635625001], 64, 83.77800617624794),
                ([0.16232721958461543,
                  1.7626161107230771], 65, 111.64440134649745),
                ([2.7362112930204074,
                  -1.2107751495102044], 49, 62.6290553489474),
            ]
            # multipliers on the expected values for allowed
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList,
                                                expected, allowedDelta, trial)
Example #10
0
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10
        
            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #11
0
    def test_KMeans_covtype20x_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 1200, 'cA'),
            ]
        else:
            # None is okay for hex_key
            csvFilenameList = [
                ('covtype20x.data', 1200, 'cA'),
                # ('covtype200x.data', 1000,'cE'),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs, hex_key in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=csvPathname,
                timeoutSecs=2000,
                hex_key=hex_key)  # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            k = 2
            kwargs = {
                'max_iter': 25,
                'initialization': 'Furthest',
                'k': k,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #12
0
    def test_KMeans_winesPCA(self):
        if localhost:
            csvFilenameList = [
                #with winesPCA2.csv speciy cols = "1,2"
                ('winesPCA.csv', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('winesPCA.csv', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = os.path.abspath(h2o.find_file('smalldata'))
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, 'winesPCA.csv', importFolderPath, 
                timeoutSecs=2000, key2=key2) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
		#appears not to take 'cols'?
                'cols': None,
                'epsilon': 1e-6,
                'k': 3
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
	    print "Expected centers: [-2.276318, -0.965151], with 59 rows."
	    print "                  [0.0388763, 1.63886039], with 71 rows."
	    print "		     [2.740469, -1.237816], with 48 rows."
	    model_key = kmeans['destination_key']
	    kmeansScoreResult = h2o.nodes[0].kmeans_score(
	    	key = parseKey['destination_key'], model_key = model_key)
	    score  = kmeansScoreResult['score']
Example #13
0
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 800),
            ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'k': 1,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,
                                             key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #14
0
    def test_KMeansGrid_basic(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='standard/covtype.data', schema='local',
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "python_source:", parseResult['python_source']
            csvPathname = parseResult['python_source']
            
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            destination_key = 'd.hex'
            params = {
                'k': 2, 
                # 'initialization': 'Furthest', 
                'initialization': None,
                'seed': 3923021996079663354, 
                'normalize': 0, 
                'max_iter': '2',
                'destination_key': destination_key
            }
    
            for trial in range(3):
                kwargs = params.copy()
                h2o.beta_features = True
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans (with grid) end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                # This doesn't work (inspecting the model)
                # inspect = h2o_cmd.runInspect(None,key=destination_key)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #15
0
    def test_KMeans_covtype20x_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 1200, 'cA'),
                ]
        else:
            # None is okay for hex_key
            csvFilenameList = [
                ('covtype20x.data', 1200,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs, hex_key in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                timeoutSecs=2000, hex_key=hex_key) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            k = 2
            kwargs = {
                'max_iter': 25,
                'initialization': 'Furthest',
                'k': k, 
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            gs = h2o.nodes[0].gap_statistic(source=hex_key, k_max=8)
            print "gap_statistic:", h2o.dump_json(gs)
Example #16
0
    def test_GLM_covtype20x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype20x.data', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2, noise=('JStack', None))
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                'cols': None,
                'epsilon': 1e-4,
                'k': 2
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
Example #17
0
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        # loop, to see if we get same centers
        for i in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            model_key = kmeans['destination_key']
            kmeansResult = h2o_cmd.runInspect(key=model_key)
            centers = kmeansResult['KMeansModel']['clusters']
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            show_results(csvPathname, parseKey, model_key, centers, 'd')
Example #18
0
    def test_kmeans_benign(self):
        """Build a K=4 KMeans model on benign.csv five times with random seeds.

        Each trial builds the model through build_model, fetches it back via
        models(), runs the simple KMeans check and finally unzips the sorted
        result tuples.

        NOTE(review): `expected` and `allowedDelta` are defined but never
        consumed in the code visible here, and neither are the unzipped
        tuples; a comparison step may be missing.
        """
        csvPathname = "logreg" + "/" + "benign.csv"
        hex_key = "benign.hex"

        parseResult = h2i.import_parse(
            bucket='smalldata',
            path=csvPathname,
            hex_key=hex_key,
            checkHeader=1,
            timeoutSecs=180,
            doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        # the inspect values replace the row/column counts taken from the parse
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        # one entry per cluster; the middle value is the cluster's row count
        # judging by the other tests in this file -- TODO confirm
        expected = [
            ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None),
            ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None),
            ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None),
            ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # no cols ignored, so every label and column takes part
        labelListUsed = list(labelList)
        numColsUsed = numCols

        model_key = 'benign_k.hex'
        # repeat, to see if we get same centers
        for trial in range(5):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879
            parameters = {
                'K': 4,
                'init': 'PlusPlus',
                'seed': kmeansSeed,
                'max_iters': 50,
                'normalize': False,
                'score_each_iteration': False,
                'ignored_columns': None,
                'validation_frame': parse_key,
            }

            kmeansResult = h2o.n0.build_model(
                algo='kmeans',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)

            modelResult = h2o.n0.models(key=model_key)

            # this prints too
            checked = h2o_kmeans.simpleCheckKMeans(
                self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
            tuplesSorted, iters, mse, names = checked

            h2o_cmd.runStoreView()

            # zip with * is its own inverse here. It's sorted by centers for easy comparisons
            ids, mses, rows, clusters = zip(*tuplesSorted)
    def test_KMeans_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'max_iter': 20,
                    'k': 1,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
Example #20
0
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        kwargs = {
            'k': 3,
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'spheres3.hex'
        }
        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)

        kmeansResult = h2o_cmd.runInspect(key='spheres3.hex')

        ### print h2o.dump_json(kmeans)
        print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        clusters = kmeansResult['KMeansModel']['clusters']

        # cluster centers can return in any order
        clustersSorted = sorted(clusters, key=itemgetter(0))

        self.assertAlmostEqual(clustersSorted[0][0], 100, delta=.2)
        self.assertAlmostEqual(clustersSorted[1][0], 200, delta=.2)
        self.assertAlmostEqual(clustersSorted[2][0], 300, delta=.2)

        self.assertAlmostEqual(clustersSorted[0][1], 100, delta=.2)
        self.assertAlmostEqual(clustersSorted[1][1], 200, delta=.2)
        self.assertAlmostEqual(clustersSorted[2][1], 300, delta=.2)

        self.assertAlmostEqual(clustersSorted[0][2], 100, delta=.2)
        self.assertAlmostEqual(clustersSorted[1][2], 200, delta=.2)
        self.assertAlmostEqual(clustersSorted[2][2], 300, delta=.2)
Example #21
0
    def test_KMeans_winesPCA(self):
        csvPathname = h2o.find_file('smalldata/winesPCA.csv')
        start = time.time()
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            #appears not to take 'cols'?
            'cols': None,
            'initialization': 'Furthest',
            'k': 3,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }

        timeoutSecs = 480

        # try the same thing 5 times
        for trial in range (10):
            start = time.time()

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = \
                h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster

            # now compare expected vs actual. By sorting on center, we should be able to compare
            # since the centers should be separated enough to have the order be consistent
            expected = [
                ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794) ,
                ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745) ,
                ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474) ,
            ]
            # multipliers on the expected values for allowed
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {'k': 1 }
                # 'destination_key': csvFilename + "_" + str(trial) + '.hex'}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansGridOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeans_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'max_iter': 20, 
                    'k': 1, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
Example #24
0
    def test_KMeans_covtype20x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype20x.data', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                'cols': None,
                'epsilon': 1e-4,
                'k': 2, 
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example #25
0
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            model_key = kmeans['destination_key']
            kmeansResult = h2o_cmd.runInspect(key=model_key)

            ## h2o.nodes[0].kmeans_apply(data_key=parseKey['destination_key'], model_key=model_key, destination_key='a')
            # this is failing for some reason
            ## h2o.nodes[0].kmeans_score(key=parseKey['destination_key'], model_key=model_key)

            clusters = kmeansResult['KMeansModel']['clusters']
            for i,c in enumerate(clusters):
                print "clusters["+str(i)+"]: ", clusters[i]

            ## print h2o.dump_json(kmeans)
            ## print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
Example #26
0
    def test_C_kmeans_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        kwargs = {
            'k': 1,
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'prostate_k.hex'
        }
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                       timeoutSecs=5,
                                       **kwargs)
        kmeansResult = h2o_cmd.runInspect(key='prostate_k.hex')
        print h2o.dump_json(kmeans)
        print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
Example #27
0
    def test_KMeans_covtype_fvec(self):
        h2o.beta_features = True
        csvFilenameList = [
            ('covtype.data', 800),
        ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(3):
                kwargs = {
                    'source': u'covtype.hex',
                    'destination_key': 'covtype.data_2.hex',
                    'initialization': 'Furthest',
                    # 'max_iter': 20,
                    'max_iter': 50,
                    'k': 2,
                }

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
Example #28
0
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'}
        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        kmeansResult = h2o_cmd.runInspect(key='spheres3.hex')

        ### print h2o.dump_json(kmeans)
        print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        clusters = kmeansResult['KMeansModel']['clusters']

        # cluster centers can return in any order
        clustersSorted = sorted(clusters, key=itemgetter(0))

        self.assertAlmostEqual(clustersSorted[0][0],100,delta=.2)
        self.assertAlmostEqual(clustersSorted[1][0],200,delta=.2)
        self.assertAlmostEqual(clustersSorted[2][0],300,delta=.2)

        self.assertAlmostEqual(clustersSorted[0][1],100,delta=.2)
        self.assertAlmostEqual(clustersSorted[1][1],200,delta=.2)
        self.assertAlmostEqual(clustersSorted[2][1],300,delta=.2)

        self.assertAlmostEqual(clustersSorted[0][2],100,delta=.2)
        self.assertAlmostEqual(clustersSorted[1][2],200,delta=.2)
        self.assertAlmostEqual(clustersSorted[2][2],300,delta=.2)
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        """Import allstate/train_set.csv with the 's3n' schema and run KMeans (k=12).

        The import + KMeans sequence is repeated trialMax times; every trial
        parses into its own hex key and dumps the inspected KMeans destination
        key as JSON.

        NOTE(review): the final print statement of this method ends in a line
        continuation with nothing after it in this copy, so its last argument
        appears to be missing.
        """
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'allstate'
        csvFilename = "train_set.csv"
        csvPathname = importFolderPath + "/" + csvFilename
        # a single timeout is shared by the parse and the KMeans run
        timeoutSecs = 600
        trialMax = 3
        for trial in range(trialMax):
            trialStart = time.time()
            # a different hex key for every trial
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='s3n',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print 'h2o reported parse time:', parseResult['response']['time']
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {'cols': None, 'initialization': 'Furthest', 'k': 12}

            # time the KMeans run and report it relative to the timeout
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # inspect the KMeans destination key and dump it as JSON
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example #30
0
    def test_KMeans_covtype_fvec(self):
        h2o.beta_features = True
        csvFilenameList = [
            ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(3):
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols': range(11, inspect['numCols']),
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results
                    'seed': 265211114317615310
                }

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        """Import allstate/train_set.csv with the 's3n' schema and run KMeans (k=12).

        The import + KMeans sequence is repeated trialMax times; every trial
        parses into its own hex key, runs runKMeansOnly and dumps the inspected
        KMeans destination key as JSON.

        NOTE(review): the final print statement of this method ends in a line
        continuation with nothing after it in this copy, so its last argument
        appears to be missing.
        """
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'allstate'
        csvFilename = "train_set.csv"
        csvPathname = importFolderPath + "/" + csvFilename
        # a single timeout is shared by the parse and the KMeans run
        timeoutSecs = 600
        trialMax = 3
        for trial in range(trialMax):
            trialStart = time.time()
            # a different hex key for every trial
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print 'h2o reported parse time:', parseResult['response']['time']
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                'cols': None,
                'initialization': 'Furthest',
                'k': 12
            }

            # time the KMeans run and report it relative to the timeout
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # inspect the KMeans destination key and dump it as JSON
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example #32
0
    def test_KMeans_covtype(self):
        csvFilenameList = [
            ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            for trial in range(3):
                kwargs = {
                    'source_key': u'covtype.hex', 
                    'destination_key': 'covtype.data_2.hex', 
                    'initialization': 'Furthest', 
                    # 'max_iter': 20, 
                    'max_iter': 50, 
                    'k': 2,
                }

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
Example #33
0
    def test_parse_bounds_libsvm(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/libsvm"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("mnist_train.svm", "cM", 30, 1),
            # FIX! fails KMeansScore
            ("tmc2007_train.svm",  "cJ", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            ("colon-cancer.svm",   "cA", 30, 1),
            ("connect4.svm",       "cB", 30, 1),
            ("duke.svm",           "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm",  "cF", 30, 1),
            ("mushrooms.svm",      "cG", 30, 1),
            ("news20.svm",         "cH", 30, 1),

            ("syn_6_1000_10.svm",  "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvPathname, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360)
            print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            # KMEANS******************************************
            for trial in range(2):
                kwargs = {
                    'k': 3, 
                    'epsilon': 1e-6, 
                    # 'cols': 2, 
                    # 'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseKey['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example #34
0
    def test_KMeans_create_frame_fvec(self):
        for trial in range(20):

            cfParamDict = define_create_frame_params(SEED)
            # default
            params = {
                'rows': 5,
                'cols': 10
            }
            h2o_util.pickRandParams(cfParamDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)

            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            kwargs = params.copy()
            timeoutSecs = 300
            hex_key = 'temp_%s.hex' % trial
            cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            inspect = h2o_cmd.runInspect(None, hex_key)
            print "\n%s" % hex_key, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            kmeansParamDict = define_KMeans_params(SEED)

            # default
            params = {
                'max_iter': 20, 
                'k': 1, 
                'destination_key': "KM_" + str(trial) + '.hex'
            }
            h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
            kwargs = params.copy()

            start = time.time()
            parseResult = {'destination_key': hex_key }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)

            print "Trial #", trial, "completed\n"
Example #35
0
    def test_KMeans_libsvm_fvec(self):
        h2o.beta_features = True
        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            # FIX! fails KMeansScore
            ("colon-cancer.svm", "cA", 30, 1),
            ("connect4.svm", "cB", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # multi-label class
            # ("tmc2007_train.svm",  "cJ", 30, 1),
            ("mnist_train.svm", "cM", 30, 1),
            ("duke.svm", "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm", "cF", 120, 1
             ),  #Summary2 is slow with 5001 columns
            ("mushrooms.svm", "cG", 30, 1),
            #        ("news20.svm",         "cH", 120, 1), #Summary2 is very slow - disable for now
            ("syn_6_1000_10.svm", "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols':
                    None,  #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60))
            elapsed = time.time() - start
            print 'h2o reported parse time:', parseResult['response']['time']
            print "parse end on ", hex_key, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                'cols': None,
                'initialization': 'Furthest',
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \

if __name__ == '__main__':
    h2o.unit_main()
Example #37
0
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        """Import an s3n URI through HDFS, parse the cats/CAT* keys and run KMeans.

        Each trial imports the bucket, prints the keys in the store, parses the
        s3n key pattern, runs KMeans with k=12 and dumps an inspect of the
        KMeans destination key.
        """
        # NOTE(review): the method name and the URL below mention allstate, but
        # the path that is parsed is cats/CAT* - confirm which dataset is intended
        # csvFilename = "covtype20x.data"
        # csvPathname = csvFilename
        csvFilename = "CAT*"
        csvPathname = "cats/" + csvFilename
        # https://s3.amazonaws.com/home-0xdiag-datasets/allstate/train_set.csv
        URI = "s3n://home-0xdiag-datasets/"
        s3nKey = URI + csvPathname

        trialMax = 1

        for trial in range(trialMax):
            trialStart = time.time()
            # since we delete the key, we have to re-import every iteration
            # s3n URI thru HDFS is not typical.
            importHDFSResult = h2o.nodes[0].import_hdfs(URI)
            s3nFullList = importHDFSResult['succeeded']
            ### print "s3nFullList:", h2o.dump_json(s3nFullList)
            # sanity check that the import listed a plausible number of files
            self.assertGreater(len(s3nFullList), 8,
                               "Didn't see more than 8 files in s3n?")
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            # print every key in the store, with its size when it reports rows
            for s in storeView['keys']:
                print "\nkey:", s['key']
                if 'rows' in s:
                    print "rows:", s['rows'], "value_size_bytes:", s[
                        'value_size_bytes']

            # destination key for the parse, unique per trial
            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            # ec2 is about 400 secs on four m2.4xlarge nodes
            # should be less on more nodes?
            timeoutSecs = 600
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey,
                                          key2,
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=60,
                                          noise=('JStack', None))
            elapsed = time.time() - start
            print s3nKey, 'h2o reported parse time:', parseKey['response'][
                'time']
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "parse result:", parseKey['destination_key']

            kwargs = {'cols': None, 'epsilon': 1e-6, 'k': 12}

            # the same timeoutSecs is reused as the KMeans timeout
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                  "Otherwise it would just parse the cached key."
            # NOTE(review): this store view result is not read by any line
            # visible here
            storeView = h2o.nodes[0].store_view()
            # pattern matching problem
            # h2o removes key after parse now
            ### print "Removing", s3nKey
            ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)

            # NOTE(review): the statement below ends in a line continuation with
            # nothing after it - the rest of the method appears to be missing
            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example #38
0
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 5 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'initialization': 'Furthest', 
                'cols': cols,
                'destination_key': 'syn_spheres100.hex'
            }
            timeoutSecs = 100
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']

            # the way we create the centers above, if we sort on the sum of xyz
            # we should get the order the same as when they were created.
            # to be safe, we'll sort the centers that were generated too, the same way
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i,center in enumerate(centersSorted):
                # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem.
                # Assuming that the difference between adjacent sums of all center values, 
                # is greater than 2x the sum of all max allowed variance on each value, 
                # Then the sums will be unique and non-overlapping with allowed variance.
                # So a sort of the centers, keyed on sum of all values for a center.
                # will create an ordering that can be compared. 
                # sort gen'ed and actual separately.
                # Adjacent center hamming distance check is done during gen above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)

                for i, v in enumerate(a):
                    emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct."
                    self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
    def test_kmeans2_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")

        ### h2b.browseTheCloud()

        # try 5 times, to see if all inits by h2o are good
        # does it break if cols is not specified?
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(10):
            kwargs = {
                'k': CLUSTERS, 
                'initialization': 'Furthest', 
                'destination_key': 'syn_spheres100.hex',
                'max_iter': 15,
            }
            timeoutSecs = 100
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # can't inspect a kmeans2 model
            # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            # print h2o.dump_json(kmeansResult)

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            # cluster centers can return in any order
            model = kmeansResult['model']
            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]


            # the way we create the centers above, if we sort on the sum of xyz
            # we should get the order the same as when they were created.
            # to be safe, we'll sort the centers that were generated too, the same way
            clustersSorted = sorted(clusters, key=sum)
            centersSorted  = sorted(centersList, key=sum)
            ### print clustersSorted

            print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)"
            cf = '{0:6.2f}'
            for c in clustersSorted:
                print ' '.join(map(cf.format,c))

            print "\ngenerated centers (sorted by key=sum)"
            for c in centersSorted:
                print ' '.join(map(cf.format,c))
            
            for i,center in enumerate(centersSorted):
                # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem.
                # Assuming that the difference between adjacent sums of all center values, 
                # is greater than 2x the sum of all max allowed variance on each value, 
                # Then the sums will be unique and non-overlapping with allowed variance.
                # So a sort of the centers, keyed on sum of all values for a center.
                # will create an ordering that can be compared. 
                # sort gen'ed and actual separately.
                # Adjacent center hamming distance check is done during gen above.
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b # h2o result
                aStr = ",".join(map(str,a))
                bStr = ",".join(map(str,b))
                iStr = str(i)

                for i, v in enumerate(a):
                    emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct."
                    self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg)

            print "Trial #", trial, "completed"
Example #40
0
    def test_KMeans_libsvm_fvec(self):

        # hack this into a function so we can call it before and after kmeans
        # kmeans is changing the last col to enum?? (and changing the data)
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename=='covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i,c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'int?"
                    assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)

        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            # FIX! fails KMeansScore
            ("colon-cancer.svm",   "cA", 30, 1),
            ("connect4.svm",       "cB", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # multi-label class
            # ("tmc2007_train.svm",  "cJ", 30, 1),
            ("mnist_train.svm", "cM", 30, 1),
            ("duke.svm",           "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm",  "cF", 120, 1), #Summary2 is slow with 5001 columns
            ("mushrooms.svm",      "cG", 30, 1),
    #        ("news20.svm",         "cH", 120, 1), #Summary2 is very slow - disable for now

            ("syn_6_1000_10.svm",  "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
        ]

        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000, doSummary=False)

            do_summary_and_inspect()

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                do_summary_and_inspect()

                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                print "hello"
                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

                do_summary_and_inspect()
Example #41
0
    def test_KMeans_covtype_fvec(self):
        csvFilenameList = [
            ('covtype.data', 800),
        ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(2):
                kwargs = {
                    'k': 6,
                    'initialization': 'Furthest',
                    # 'initialization': '',
                    # 'ignored_cols': range(11, inspect['numCols']),
                    # ignore the response
                    'ignored_cols_by_name': 'C55',
                    'max_iter': 100,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results
                    'seed': 265211114317615310
                }

                start = time.time()
                kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

                expected = [
                    ([
                        2781.64184460309, 162.69950733599902,
                        16.545275983574268, 243.73547234768156,
                        50.48239522121315, 942.4480922085701,
                        208.3915356763203, 218.7135425941215,
                        140.10956243018794, 1040.6795741397266,
                        0.22024185323685105, 0.0845245225799837,
                        0.4957505706376572, 0.19948305354550802,
                        0.01635558145683929, 0.033196811983660604,
                        0.026025394050259283, 0.04566180477986607,
                        0.008617572941792261, 0.03547936261257615, 0.0, 0.0,
                        0.006189327591882107, 0.13606268110663236,
                        0.037222303163733886, 0.024007252359445064,
                        0.040891651692487006, 0.003232264365769295,
                        1.6188302332734367e-05, 0.004667627172605076,
                        0.00910861811255187, 9.173371321882807e-05,
                        0.0025415634662392956, 0.008946735089224526,
                        0.0023095311328034363, 0.04957397784361021,
                        0.09252154393235448, 0.03887890610245037, 0.0, 0.0,
                        0.0010792201555156243, 0.004867282901375466,
                        0.08281935473426902, 0.045640220376755754,
                        0.04933654940939677, 0.08426550974265995,
                        0.07772003949945769, 0.001327440791284218,
                        0.0014191745045030462, 0.0, 0.0, 0.009513325670870229,
                        0.010970272880816322, 0.009443176360761713
                    ], 185319, 116283720155.37769),
                    ([
                        2892.8730376693256, 119.94759695676377,
                        11.22516236778623, 189.0301354611245,
                        24.621525329374652, 2631.9842642419744,
                        219.94967526442753, 223.3794395991835,
                        135.71226572647987, 5409.1797365002785,
                        0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325,
                        0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0,
                        0.026498422712933754, 0.0, 0.04152904063833735,
                        0.005158656522545927, 0.0695490814622379, 0.0,
                        0.0634997216552236, 0.05418444980515866,
                        0.010391538318797551, 0.0002969010948227871, 0.0, 0.0,
                        0.0, 0.3677862312117276, 0.07596956763778066, 0.0,
                        0.01109667841900167, 0.005641120801632956, 0.0,
                        0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586,
                        0.018444980515865652, 0.010354425681944703
                    ], 26945, 46932273891.61873),
                    ([
                        3022.020861415003, 137.8546989122598, 13.3449108178427,
                        282.99227296949937, 45.23691263596753,
                        1606.0215197015768, 216.64941537882825,
                        222.64791856054669, 137.40339644525253,
                        2529.4366555907336, 0.4113429046111407,
                        0.08617284724616782, 0.5024842481426914, 0.0, 0.0,
                        0.0052506191028494405, 0.0, 0.014176671577693489, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.018949249239835743,
                        0.029850161436945546, 0.05403435628977148,
                        0.020892761982382997, 0.0, 0.0, 0.0018494718033917432,
                        0.011731607159650168, 0.005979436381304661,
                        0.0047098837027052445, 0.013714303626845553,
                        0.0007601642581737249, 0.047788470580859534,
                        0.10631328171530674, 0.04641704021817498,
                        0.0036519231372057308, 0.011872668568383437, 0.0,
                        0.00034481677690354536, 0.17267483777937995,
                        0.044473527475627724, 0.05637754302372967,
                        0.1292435973793925, 0.11970627880003762,
                        0.0013871038525438075, 0.004858781856368139, 0.0, 0.0,
                        0.03151155136202627, 0.028988119494686687,
                        0.012491771417823892
                    ], 127604, 95229063588.02844),
                    ([
                        3051.365089986695, 168.1268450579292,
                        14.114846831985933, 287.6101588092033,
                        50.702549817536706, 2835.266162979793,
                        209.89460702308608, 226.92302305495684,
                        148.84282479633362, 1461.8985753079312,
                        0.3284728328107128, 0.0006069141527711857,
                        0.670920253036516, 0.0, 0.0, 0.0054700083256172235,
                        0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.03886584862938554, 0.013250959002170886,
                        0.04277966681969203, 0.05480901656564399, 0.0, 0.0,
                        0.0010426473906581905, 0.0018440853103432178, 0.0,
                        0.0035014278044491476, 0.011671426014830491,
                        0.002435437561761296, 0.044405885511091744,
                        0.10662236712081483, 0.042756323967662366, 0.0,
                        0.007384122192049426, 0.006263665294625696, 0.0,
                        0.14390868276285998, 0.022152366576148275,
                        0.07071327974851968, 0.14799368186805065,
                        0.1011367968938445, 0.009111493242244337,
                        0.006427065258833325, 0.0009259331305098857,
                        0.002318723301612991, 0.03055579330682623,
                        0.041044514818820564, 0.024074261393257027
                    ], 128519, 106432862495.53804),
                    ([
                        3052.088693852026, 149.15056174929376,
                        11.549996765359152, 328.4748452763461,
                        44.2420589567205, 4786.68757682272, 215.8348392383499,
                        226.91413106764713, 143.9780260065124,
                        4192.589071226791, 0.8949819938326181, 0.0,
                        0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.0022642485929312314, 0.002415198499126647, 0.0,
                        0.00012938563388178466, 0.0, 0.1351648588618377, 0.0,
                        0.0, 0.0, 0.014836219351777974, 0.0, 0.0,
                        0.010674314795247235, 0.03553792077286352, 0.0,
                        0.039290104155435275, 0.09289888512712138,
                        0.03864317598602636, 0.0, 0.0, 0.0, 0.0,
                        0.4371509283419232, 0.08636491061609126,
                        0.0003665926293317232, 0.002717098311517478,
                        0.017100467944709204, 0.0, 0.0028249196730856323, 0.0,
                        0.0, 0.03226015138119164, 0.017316110667845514,
                        0.03204450865805533
                    ], 46373, 77991941653.19676),
                    ([
                        3119.4885286481917, 165.13178470083923,
                        11.672206122079334, 271.2690333876713,
                        39.407851838435064, 4959.81440560285,
                        212.5861709835175, 227.95909557447322,
                        148.6725381875264, 1613.4457676749382,
                        0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323,
                        0.0, 0.0, 0.0, 0.008346917828895732,
                        0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865,
                        0.0, 0.0, 0.02815009358208054, 0.012512829801364487,
                        0.0, 0.13355068526233171, 0.11424560767976816,
                        0.008799734347642335, 0.0, 0.0018867354947775161,
                        0.0012226046006158305, 0.0, 0.44056028497252914,
                        0.10774014369377528, 0.0033810300066413087,
                        0.014580691903640641, 0.02313892410795146,
                        0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0,
                        0.06503954597597053, 0.022625732053371973,
                        0.008256354525146411
                    ], 66252, 74666940350.2879),
                ]

                ### print h2o.dump_json(kmeans)
                predictKey = 'd'
                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeansResult, csvPathname, parseResult, predictKey,
                    **kwargs)
                # all are multipliers of expected tuple value
                allowedDelta = (0.01, 0.01, 0.01)
                # these clusters were sorted compared to the cluster order in training
                h2o_kmeans.showClusterDistribution(self,
                                                   tupleResultList,
                                                   expected,
                                                   trial=trial)
                # why is the expected # of rows not right in KMeans2. That means predictions are wrong
                h2o_kmeans.compareResultsToExpected(self,
                                                    tupleResultList,
                                                    expected,
                                                    allowedDelta,
                                                    allowError=False,
                                                    allowRowError=True,
                                                    trial=trial)

                print "Trial #", trial, "completed\n"
    def test_KMeans_libsvm_fvec(self):

        # hack this into a function so we can call it before and after kmeans
        # kmeans is changing the last col to enum?? (and changing the data)
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename == 'covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0,
                                     naCnt,
                                     msg='col %s naCnt %d should be %s' %
                                     (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int',
                                     stype,
                                     msg='col %s type %s should be %s' %
                                     (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i, c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'int?"
                    assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (
                        i, c)

        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            # FIX! fails KMeansScore
            ("colon-cancer.svm", "cA", 30, 1),
            ("connect4.svm", "cB", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # multi-label class
            # ("tmc2007_train.svm",  "cJ", 30, 1),
            ("mnist_train.svm", "cM", 30, 1),
            ("duke.svm", "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm", "cF", 120, 1
             ),  #Summary2 is slow with 5001 columns
            ("mushrooms.svm", "cG", 30, 1),
            #        ("news20.svm",         "cH", 120, 1), #Summary2 is very slow - disable for now
            ("syn_6_1000_10.svm", "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
        ]

        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000,
                                           doSummary=False)

            do_summary_and_inspect()

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols':
                    None,  #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                do_summary_and_inspect()

                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                print "hello"
                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)

                do_summary_and_inspect()
Example #43
0
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            # pass SEED so it's repeatable
            kwargs = {
                'k': CLUSTERS,
                'max_iter': 10,
                'initialization': 'Furthest',
                'cols': None,
                'destination_key': 'syn_spheres100.hex',
                'seed': SEED
            }
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i, center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b  # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)
                self.assertAlmostEqual(a[0],
                                       b[0],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " x not correct.")
                self.assertAlmostEqual(a[1],
                                       b[1],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " y not correct.")
                self.assertAlmostEqual(a[2],
                                       b[2],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " z not correct.")

            print "Trial #", trial, "completed"
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60))
            elapsed = time.time() - start
            print "parse end on ", hex_key, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                'cols': None,
                'initialization': 'Furthest',
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \

# Script entry point: hand control to the h2o test harness.
if __name__ == "__main__":
    h2o.unit_main()
Example #45
0
    def test_KMeans2_sphere5_inits(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        expectedCenters = write_spheres_dataset(csvPathname, CLUSTERS,
                                                SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        savedResults = []
        Result = collections.namedtuple(
            'Result',
            'trial clusters size cluster_variances error iterations normalized max_iter clustersSorted'
        )

        # save the best for comparison. Print messages when we update best
        sameAsBest = 0
        # big number? to init
        bestResult = Result(None, None, None, None, None, None, None, None,
                            None)
        for trial in range(TRIALS):
            # pass SEED so it's repeatable
            kwargs = {
                'normalize': 0,
                'k': CLUSTERS,
                'max_iter': MAX_ITER,
                'initialization': INIT,
                # 'initialization': 'PlusPlus',
                'destination_key': 'syn_spheres100.hex',
                'seed': SEED
            }

            timeoutSecs = 30
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # see if we took the full limit to get an answer

            # inspect of model doesn't work
            # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            model = kmeansResult['model']
            clusters = model["centers"]
            size = model["size"]
            cluster_variances = model["within_cluster_variances"]
            # round to int to avoid fp error when saying "same"
            error = int(model["total_within_SS"])
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            # clustersSorted = sorted(clusters, key=itemgetter(2))
            clustersSorted = sorted(clusters)

            r = Result(
                trial,
                clusters,
                size,
                cluster_variances,
                error,
                iterations,
                normalized,
                max_iter,
                clustersSorted,
            )

            savedResults.append(r)

            if iterations >= (
                    max_iter -
                    1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s",
                    (iterations, max_iter))

            print "iterations", iterations
            ### print clustersSorted

            # For now, just analyze the one with the lowest error
            # we could analyze how many are not best, and how many are best (maybe just look at error
            print "savedResults, error"
            print r.error
            if bestResult.error and r.error <= bestResult.error:
                sameAsBest += 1
                # we can check that if it has the same error, the sizes should be the same (integer) and reflects centers?
                # should
                if r.size != bestResult.size:
                    raise Exception(
                        "Would expect that if two trials got the same error (rounded to int), the cluster sizes would likely be the same? %s %s"
                        % (r.size, bestResult.size))

            if not bestResult.error:  # init case
                bestResult = r
            elif r.error < bestResult.error:
                print "Trial", r.trial, "has a lower error", r.error, "than current lowest error", bestResult.error
                print "Using it for best now"
                bestResult = r

            print "Trial #", trial, "completed"

        print "\nApparently, %s out of %s trials, got the same best error: %s  (lowest) " % (
            sameAsBest, TRIALS, bestResult.error)
        print "\nh2o best result was from trial %s, centers sorted:" % bestResult.trial
        print bestResult.clustersSorted
        print "\ngenerated centers for comparison"
        print expectedCenters
        for i, center in enumerate(expectedCenters):
            a = center
            bb = bestResult.clustersSorted
            print "bb:", bb
            b = bb[i]
            print "\nexpected:", a
            print "h2o:", b  # h2o result
            aStr = ",".join(map(str, a))
            bStr = ",".join(map(str, b))
            iStr = str(i)
            self.assertAlmostEqual(a[0],
                                   b[0],
                                   delta=1,
                                   msg=aStr + "!=" + bStr +
                                   ". Sorted cluster center " + iStr +
                                   "; x not correct.")
            self.assertAlmostEqual(a[1],
                                   b[1],
                                   delta=1,
                                   msg=aStr + "!=" + bStr +
                                   ". Sorted cluster center " + iStr +
                                   "; y not correct.")
            self.assertAlmostEqual(a[2],
                                   b[2],
                                   delta=1,
                                   msg=aStr + "!=" + bStr +
                                   ". Sorted cluster center " + iStr +
                                   "; z not correct.")
    def test_KMeans2_sphere5_bad_inits(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        expectedCenters = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        savedResults = []
        Result = collections.namedtuple('Result', 
            'trial clusters size cluster_variances error iterations normalized max_iter clustersSorted')

        # save the best for comparison. Print messages when we update best
        sameAsBest = 1
        # big number? to init
        bestResult = Result(None, None, None, None, None, None, None, None, None)
        for trial in range(TRIALS):
            # pass SEED so it's repeatable
            kwargs = {
                'normalize': 0,
                'k': CLUSTERS, 
                'max_iter': MAX_ITER, 
                'initialization': INIT,
                # 'initialization': 'PlusPlus',
                'destination_key': 'syn_spheres100.hex', 
                'seed': SEED
            }

            timeoutSecs = 30
            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # see if we took the full limit to get an answer
            
            # inspect of model doesn't work
            # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')
            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

            model = kmeansResult['model']
            clusters = model["centers"]
            size = model["size"]
            cluster_variances = model["within_cluster_variances"]
            # round to int to avoid fp error when saying "same"
            error = int(model["total_within_SS"])
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            # clustersSorted = sorted(clusters, key=itemgetter(0))
            clustersSorted = sorted(clusters)

            r = Result (
                trial,
                clusters,
                size,
                cluster_variances,
                error,
                iterations,
                normalized,
                max_iter,
                clustersSorted,
            )

            savedResults.append(r)

            if iterations >= (max_iter-1): # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception("KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s", 
                    (iterations, max_iter))

            print "iterations", iterations
            ### print clustersSorted

            # For now, just analyze the one with the lowest error
            # we could analyze how many are not best, and how many are best (maybe just look at error
            print "savedResults, error"
            print r.error
            if bestResult.error and r.error <= bestResult.error:
                sameAsBest += 1
                # we can check that if it has the same error, the sizes should be the same (integer) and reflects centers?
                # should 
                if sorted(r.size)!=sorted(bestResult.size):
                    raise Exception("Would expect that if two trials got the same error (rounded to int), the cluster sizes would likely be the same? %s %s" % 
                        (r.size, bestResult.size))

            if not bestResult.error: # init case
                bestResult = r 
            elif r.error < bestResult.error:
                print "Trial", r.trial, "has a lower error", r.error, "than current lowest error", bestResult.error
                print "Using it for best now"
                bestResult = r

            print "Trial #", trial, "completed"
                
        print "\nApparently, %s out of %s trials, got the same best error: %s  (lowest) " % (sameAsBest, TRIALS, bestResult.error)
        print "\nh2o best result was from trial %s, centers sorted:" % bestResult.trial
        print bestResult.clustersSorted
        print "\ngenerated centers for comparison"
        print expectedCenters
        for i,center in enumerate(expectedCenters):
            a = center
            bb = bestResult.clustersSorted
            print "bb:", bb
            b = bb[i]
            print "\nexpected:", a
            print "h2o:", b # h2o result
            aStr = ",".join(map(str,a))
            bStr = ",".join(map(str,b))
            iStr = str(i)
            self.assertAlmostEqual(a[0], b[0], delta=2, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+"; x not correct.")
            self.assertAlmostEqual(a[1], b[1], delta=2, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+"; y not correct.")
            self.assertAlmostEqual(a[2], b[2], delta=2, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+"; z not correct.")
Example #47
0
    def test_KMeans_covtype_fvec(self):
        """Run KMeans on covtype.data and compare the clusters to recorded values.

        Imports and parses standard/covtype.data from the 'home-0xdiag-datasets'
        bucket, then runs two KMeans trials with identical parameters (k=6,
        'Furthest' initialization, max_iter=100, column C55 ignored, fixed seed).
        Each trial's results are checked against the hard-coded ``expected``
        list below.
        """
        # each entry: (csv file name, timeout in seconds passed to runKMeans)
        csvFilenameList = [
            ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            # the parse uses its own fixed timeout (2000); timeoutSecs from the list is only used for KMeans
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            # report the parsed size with thousands separators
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # both trials use the same parameters and seed, and are checked against the same expected values
            for trial in range(2):
                kwargs = {
                    'k': 6,
                    'initialization': 'Furthest',
                    # 'initialization': '',
                    # 'ignored_cols': range(11, inspect['numCols']),
                    # ignore the response
                    'ignored_cols_by_name': 'C55',
                    'max_iter': 100,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results
                    'seed': 265211114317615310
                }

                start = time.time()
                kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                # elapsed time is also reported as a percentage of the allowed timeout
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

                # Recorded results, one tuple per cluster (six tuples for k=6).
                # NOTE(review): each tuple looks like (cluster center coordinates, row count, error)
                # -- confirm against h2o_kmeans.compareResultsToExpected.
                expected = [
                    ([2781.64184460309, 162.69950733599902, 16.545275983574268, 243.73547234768156, 50.48239522121315, 942.4480922085701, 208.3915356763203, 218.7135425941215, 140.10956243018794, 1040.6795741397266, 0.22024185323685105, 0.0845245225799837, 0.4957505706376572, 0.19948305354550802, 0.01635558145683929, 0.033196811983660604, 0.026025394050259283, 0.04566180477986607, 0.008617572941792261, 0.03547936261257615, 0.0, 0.0, 0.006189327591882107, 0.13606268110663236, 0.037222303163733886, 0.024007252359445064, 0.040891651692487006, 0.003232264365769295, 1.6188302332734367e-05, 0.004667627172605076, 0.00910861811255187, 9.173371321882807e-05, 0.0025415634662392956, 0.008946735089224526, 0.0023095311328034363, 0.04957397784361021, 0.09252154393235448, 0.03887890610245037, 0.0, 0.0, 0.0010792201555156243, 0.004867282901375466, 0.08281935473426902, 0.045640220376755754, 0.04933654940939677, 0.08426550974265995, 0.07772003949945769, 0.001327440791284218, 0.0014191745045030462, 0.0, 0.0, 0.009513325670870229, 0.010970272880816322, 0.009443176360761713], 185319, 116283720155.37769) ,

                    ([2892.8730376693256, 119.94759695676377, 11.22516236778623, 189.0301354611245, 24.621525329374652, 2631.9842642419744, 219.94967526442753, 223.3794395991835, 135.71226572647987, 5409.1797365002785, 0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325, 0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0, 0.026498422712933754, 0.0, 0.04152904063833735, 0.005158656522545927, 0.0695490814622379, 0.0, 0.0634997216552236, 0.05418444980515866, 0.010391538318797551, 0.0002969010948227871, 0.0, 0.0, 0.0, 0.3677862312117276, 0.07596956763778066, 0.0, 0.01109667841900167, 0.005641120801632956, 0.0, 0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586, 0.018444980515865652, 0.010354425681944703], 26945, 46932273891.61873) ,

                    ([3022.020861415003, 137.8546989122598, 13.3449108178427, 282.99227296949937, 45.23691263596753, 1606.0215197015768, 216.64941537882825, 222.64791856054669, 137.40339644525253, 2529.4366555907336, 0.4113429046111407, 0.08617284724616782, 0.5024842481426914, 0.0, 0.0, 0.0052506191028494405, 0.0, 0.014176671577693489, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018949249239835743, 0.029850161436945546, 0.05403435628977148, 0.020892761982382997, 0.0, 0.0, 0.0018494718033917432, 0.011731607159650168, 0.005979436381304661, 0.0047098837027052445, 0.013714303626845553, 0.0007601642581737249, 0.047788470580859534, 0.10631328171530674, 0.04641704021817498, 0.0036519231372057308, 0.011872668568383437, 0.0, 0.00034481677690354536, 0.17267483777937995, 0.044473527475627724, 0.05637754302372967, 0.1292435973793925, 0.11970627880003762, 0.0013871038525438075, 0.004858781856368139, 0.0, 0.0, 0.03151155136202627, 0.028988119494686687, 0.012491771417823892], 127604, 95229063588.02844) ,

                    ([3051.365089986695, 168.1268450579292, 14.114846831985933, 287.6101588092033, 50.702549817536706, 2835.266162979793, 209.89460702308608, 226.92302305495684, 148.84282479633362, 1461.8985753079312, 0.3284728328107128, 0.0006069141527711857, 0.670920253036516, 0.0, 0.0, 0.0054700083256172235, 0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03886584862938554, 0.013250959002170886, 0.04277966681969203, 0.05480901656564399, 0.0, 0.0, 0.0010426473906581905, 0.0018440853103432178, 0.0, 0.0035014278044491476, 0.011671426014830491, 0.002435437561761296, 0.044405885511091744, 0.10662236712081483, 0.042756323967662366, 0.0, 0.007384122192049426, 0.006263665294625696, 0.0, 0.14390868276285998, 0.022152366576148275, 0.07071327974851968, 0.14799368186805065, 0.1011367968938445, 0.009111493242244337, 0.006427065258833325, 0.0009259331305098857, 0.002318723301612991, 0.03055579330682623, 0.041044514818820564, 0.024074261393257027], 128519, 106432862495.53804) ,

                    ([3052.088693852026, 149.15056174929376, 11.549996765359152, 328.4748452763461, 44.2420589567205, 4786.68757682272, 215.8348392383499, 226.91413106764713, 143.9780260065124, 4192.589071226791, 0.8949819938326181, 0.0, 0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0022642485929312314, 0.002415198499126647, 0.0, 0.00012938563388178466, 0.0, 0.1351648588618377, 0.0, 0.0, 0.0, 0.014836219351777974, 0.0, 0.0, 0.010674314795247235, 0.03553792077286352, 0.0, 0.039290104155435275, 0.09289888512712138, 0.03864317598602636, 0.0, 0.0, 0.0, 0.0, 0.4371509283419232, 0.08636491061609126, 0.0003665926293317232, 0.002717098311517478, 0.017100467944709204, 0.0, 0.0028249196730856323, 0.0, 0.0, 0.03226015138119164, 0.017316110667845514, 0.03204450865805533], 46373, 77991941653.19676) ,

                    ([3119.4885286481917, 165.13178470083923, 11.672206122079334, 271.2690333876713, 39.407851838435064, 4959.81440560285, 212.5861709835175, 227.95909557447322, 148.6725381875264, 1613.4457676749382, 0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323, 0.0, 0.0, 0.0, 0.008346917828895732, 0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865, 0.0, 0.0, 0.02815009358208054, 0.012512829801364487, 0.0, 0.13355068526233171, 0.11424560767976816, 0.008799734347642335, 0.0, 0.0018867354947775161, 0.0012226046006158305, 0.0, 0.44056028497252914, 0.10774014369377528, 0.0033810300066413087, 0.014580691903640641, 0.02313892410795146, 0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0, 0.06503954597597053, 0.022625732053371973, 0.008256354525146411], 66252, 74666940350.2879) ,
                ]


                ### print h2o.dump_json(kmeans)
                # key name handed to bigCheckResults -- presumably where the predict/score output is stored; verify
                predictKey = 'd'
                (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname, parseResult, predictKey, **kwargs)
                # all are multipliers of expected tuple value
                allowedDelta = (0.01, 0.01, 0.01)
                # these clusters were sorted compared to the cluster order in training
                h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
                # why is the expected # of rows not right in KMeans2. That means predictions are wrong
                # NOTE(review): allowRowError=True presumably tolerates that row-count mismatch -- confirm in h2o_kmeans
                h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=False,
                    allowRowError=True, trial=trial)

                print "Trial #", trial, "completed\n"
    def test_parse_bounds_libsvm(self):
        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            # FIX! fails KMeansScore
            ("colon-cancer.svm",   "cA", 30, 1),
            ("connect4.svm",       "cB", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # multi-label class
            # ("tmc2007_train.svm",  "cJ", 30, 1),
            ("mnist_train.svm", "cM", 30, 1),
            ("duke.svm",           "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm",  "cF", 30, 1),
            ("mushrooms.svm",      "cG", 30, 1),
            ("news20.svm",         "cH", 30, 1),

            ("syn_6_1000_10.svm",  "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print csvPathname, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'cols': range(10),
                    # 'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #49
0
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
            ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
            ]

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {
                    'k': k,
                    'epsilon': epsilon,
                    'cols': cols,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,
                                             key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeans_allstate_s3n_thru_hdfs(self):
        csvFilename = "CAT*"
        URI = "s3n://home-0xdiag-datasets/cats"
        s3nKey = URI + "/" + csvFilename

        trialMax = 1

        for trial in range(trialMax):
            trialStart = time.time()
            # since we delete the key, we have to re-import every iteration
            # s3n URI thru HDFS is not typical.
            importHDFSResult = h2o.nodes[0].import_hdfs(URI)
            s3nFullList = importHDFSResult['succeeded']
            ### print "s3nFullList:", h2o.dump_json(s3nFullList)
            self.assertGreater(len(s3nFullList),1,"Didn't see more than 1 files in s3n?")
            storeView = h2o.nodes[0].store_view()
            ### print "storeView:", h2o.dump_json(storeView)
            for s in storeView['keys']:
                print "\nkey:", s['key']
                if 'rows' in s:
                    print "rows:", s['rows'], "value_size_bytes:", s['value_size_bytes']

            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            # ec2 is about 400 secs on four m2.4xlarge nodes
            # should be less on more nodes?
            timeoutSecs = 600
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey, key2,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=('JStack', None))
            elapsed = time.time() - start
            print s3nKey, 'h2o reported parse time:', parseKey['response']['time']
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "parse result:", parseKey['destination_key']

            kwargs = {
                'cols': None,
                'epsilon': 1e-6,
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                  "Otherwise it would just parse the cached key."
            storeView = h2o.nodes[0].store_view()
            # pattern matching problem
            # h2o removes the key after parse now
            ### print "Removing", s3nKey
            ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \