Example #1
    def test_PCA_UCIwine(self):
        csvFilename = "wine.data"
        timeoutSecs = 300
        trialStart = time.time()
        #parse
        trainKey = "wine.hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       schema='local',
                                       hex_key=trainKey,
                                       timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        #PCA params
        params = {
            'destination_key': "python_PCA_key",
            'tolerance': 0.0,
            'standardize': 1
        }

        kwargs = params.copy()
        h2o.beta_features = True
        #TODO(spencer): Hack around no polling FVEC
        PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
        start = time.time()
        h2o_cmd.runPCA(parseResult=parseResult,
                       timeoutSecs=timeoutSecs,
                       noPoll=True,
                       returnFast=False,
                       **kwargs)
        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                         pollTimeoutSecs=120,
                         retryDelaySecs=2)
        #time.sleep(100)
        elapsed = time.time() - start
        PCAResult['python_elapsed'] = elapsed
        PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
        print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
              "%f pct. of timeout" % (PCAResult['python_%timeout'])
        #check PCA results
        pcaView = h2o_cmd.runPCAView(modelKey="python_PCA_key")
        h2o_pca.simpleCheckPCA(self, pcaView)
        h2o_pca.resultsCheckPCA(self, pcaView)
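
These examples all repeat the same bookkeeping: submit PCA with noPoll=True, wait on the H2O jobs list, and record the elapsed time against the timeout. Below is a minimal sketch of that pattern as one reusable helper; run_pca_timed is a hypothetical name, and the h2o_cmd / h2j imports are assumed to be the same modules the tests above use.

import time
import h2o_cmd
import h2o_jobs as h2j  # assumed alias, matching the h2j used in the tests above

def run_pca_timed(parseResult, timeoutSecs, **params):
    # submit the PCA job without polling, then wait on the jobs list,
    # mirroring the runPCA/pollWaitJobs sequence in the test above
    start = time.time()
    h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs,
                   noPoll=True, returnFast=False, **params)
    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2)
    elapsed = time.time() - start
    return {'python_elapsed': elapsed, 'python_%timeout': 1.0 * elapsed / timeoutSecs}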
Example #2
    def test_PCA_UCIwine(self):
        csvFilename = "wine.data"
        timeoutSecs=300
        trialStart = time.time()
        #parse
        trainKey = "wine.hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        #PCA params
        params = { 
            'destination_key': "python_PCA_key",
            'tolerance':0.0,
            'standardize':1
            }   

        kwargs = params.copy()
        h2o.beta_features = True
        #TODO(spencer): Hack around no polling FVEC
        PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
        start = time.time()
        h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs)
        h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2)
        #time.sleep(100)
        elapsed = time.time() - start
        PCAResult['python_elapsed']  = elapsed
        PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
        print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
              "%f pct. of timeout" % (PCAResult['python_%timeout'])
        #check PCA results
        pcaView = h2o_cmd.runPCAView(modelKey = "python_PCA_key")
        h2o_pca.simpleCheckPCA(self,pcaView)
        h2o_pca.resultsCheckPCA(self,pcaView)
Example #3
    def test_PCA_manyfiles_fvec(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'PCAModelKey'
        files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800)
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=hexKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]
            print ignore_x
            ignored_cols = ",".join(map(lambda x: "C" + str(x), ignore_x))
            
            # for comparison
            ignore_x = h2o_glm.goodXFromColumnInfo(378, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            print ignore_x


            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'ignored_cols': ignored_cols,
                    'tolerance': tolerance,
                    'standardize': 1,
                    'max_pc': None,
                }

                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult,
                     timeoutSecs=timeoutSecs, **kwargs)
                print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname
                print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout"
                print "Checking PCA results: "
        
                h2o_pca.simpleCheckPCA(self,pcaResult)
                h2o_pca.resultsCheckPCA(self,pcaResult)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = h2o_cmd.runInspect(key=modelKey)
                # pull the model's standard deviations out of the Inspect result
                sdevs = pcaInspect["PCAModel"]["stdDev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["PCAModel"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
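
The ignored_cols string above is just a list of column indices rendered as fvec column names ("C" plus the index) and comma-joined. A small sketch of that mapping; the helper name is hypothetical.

def ignored_cols_from_indices(indices):
    # fvec columns are named C<index>, so ignored_cols is a comma-separated
    # string like "C3,C4,C5" built from the integer indices
    return ",".join("C" + str(i) for i in indices)

# e.g. ignored_cols_from_indices([3, 4, 5]) == "C3,C4,C5"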
Example #4
    def test_PCA_ignore_enums_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100, 3, 'cA', 300), 
            # (10001, 2, 'cA', 300), 
            # (10000, 500, 'cH', 300), 
            # (10000, 1000, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            start = time.time()
            modelKey = 'PCAModelKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'ignored_cols': 'C1',
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])            
    
                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey = modelKey) 
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = pcaView
                # pull the standard deviations out of the PCA model view
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
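
The propVar list printed above holds the proportion of variance captured by each principal component. A sketch of how it could be used to pick a component count, assuming the entries are fractions that sum to roughly 1; the 0.95 threshold and the helper name are only illustrative.

def components_for_variance(propVars, threshold=0.95):
    # count how many components are needed before the cumulative
    # proportion of variance reaches the threshold
    total = 0.0
    for k, p in enumerate(propVars, start=1):
        total += p
        if total >= threshold:
            return k
    return len(propVars)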
Example #5
    def test_PCA_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        if localhost:
            tryList = [
                (10000, 100, 'cA', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (10000, 100, 'cD', 300), 
                (10000, 200, 'cE', 300), 
                (10000, 300, 'cF', 300), 
                (10000, 400, 'cG', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE ****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            modelKey = 'PCAModelKey'

            # Parse ****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            #h2o.beta_features = True
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                #h2o.beta_features = True
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, noPoll = True,
                     timeoutSecs=timeoutSecs, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])            
    
                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey = modelKey) 
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                #h2o.beta_features = True
                pcaInspect = pcaView
                # pull the standard deviations out of the PCA model view
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
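
Every example logs a benchmark line in the same '{:d} jvms, {:d}GB heap, ...' format. A sketch of that logging wrapped in one hypothetical helper, assuming the same h2o test module and cloudPerfH2O object used above.

import h2o  # assumed to be the same h2o test module used above

def log_benchmark(algo, csvFilename, elapsedSecs):
    # same benchmark line the tests build inline: cloud size, heap, algo, dataset, seconds
    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
        len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsedSecs)
    print l
    h2o.cloudPerfH2O.message(l)
    return l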
Example #6
    def test_PCA_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 50, 'cB', 300),
            (10000, 100, 'cC', 300),
            # (10000, 500, 'cH', 300),
            # (10000, 1000, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print(rowCount, colCount, hex_key, timeoutSecs)
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            h2o.beta_features = False  #turn off beta_features
            start = time.time()

            #h2o.beta_features = False
            modelKey = 'PCAModelKey'
            scoreKey = 'PCAScoreKey'

            # Parse ****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseResult = h2i.import_parse(bucket=None,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           noPoll=h2o.beta_features,
                                           doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            h2o.beta_features = True
            for tolerance in [i / 10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                kwargs = params.copy()
                h2o.beta_features = True
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                h2o_cmd.runPCA(parseResult=parseResult,
                               timeoutSecs=timeoutSecs,
                               noPoll=True,
                               **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300,
                                 pollTimeoutSecs=120,
                                 retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed'] = elapsed
                PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])

                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self, pcaView)
                h2o_pca.resultsCheckPCA(self, pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)

                #h2o.beta_features = True
                pcaInspect = pcaView
                # pull the standard deviations out of the PCA model view
                sdevs = pcaInspect["pca_model"]["sdev"]
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
                #h2o.beta_features=False
                print
                print
                print
                num_pc = pcaInspect['pca_model']['num_pc']
                print "The number of standard deviations obtained: ", num_pc
                print
                print
                print

                if DO_PCA_SCORE:
                    # just score with same data
                    score_params = {
                        'destination_key': scoreKey,
                        'model': modelKey,
                        'num_pc': num_pc,
                        'source': hex_key,
                    }
                    kwargs = score_params.copy()
                    pcaScoreResult = h2o.nodes[0].pca_score(
                        timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                    h2j.pollWaitJobs(timeoutSecs=300,
                                     pollTimeoutSecs=120,
                                     retryDelaySecs=2)
                    print "PCAScore completed in", pcaScoreResult[
                        'python_elapsed'], "seconds. On dataset: ", csvPathname
                    print "Elapsed time was ", pcaScoreResult[
                        'python_%timeout'], "% of the timeout"

                    # Logging to a benchmark file
                    algo = "PCAScore " + " num_pc=" + str(
                        score_params['num_pc'])
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                        csvFilename, pcaScoreResult['python_elapsed'])
                    print l
                    h2o.cloudPerfH2O.message(l)
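
Example #6 also scores the fitted model against the training frame when DO_PCA_SCORE is set. A minimal sketch of that scoring call in isolation, timing it explicitly rather than reading timing keys from the response; the helper name is hypothetical, and the pca_score / pollWaitJobs calls are assumed to behave as in the example above.

import time
import h2o
import h2o_jobs as h2j  # assumed alias, matching the tests above

def score_pca_model(modelKey, sourceKey, scoreKey, num_pc, timeoutSecs=300):
    # same PCAScore pattern as above: submit without polling, then wait on the jobs list
    start = time.time()
    h2o.nodes[0].pca_score(model=modelKey, source=sourceKey,
                           destination_key=scoreKey, num_pc=num_pc,
                           timeoutSecs=timeoutSecs, noPoll=True)
    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2)
    return time.time() - start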
Example #7
    def test_PCA_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 10, 'cA', 300), 
            (10000, 50, 'cB', 300), 
            (10000, 100, 'cC', 300), 
            # (10000, 500, 'cH', 300), 
            # (10000, 1000, 'cI', 300), 
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print (rowCount, colCount, hex_key, timeoutSecs)
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            start = time.time()

            modelKey = 'PCAModelKey'
            scoreKey = 'PCAScoreKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                kwargs = params.copy()
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])
                
                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)

                pcaInspect = pcaView
                # pull the standard deviations out of the PCA model view
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
                print
                print
                print 
                num_pc = pcaInspect['pca_model']['num_pc']
                print "The number of standard deviations obtained: ", num_pc
                print 
                print
                print


                if DO_PCA_SCORE:
                    # just score with same data
                    score_params = {
                        'destination_key': scoreKey,
                        'model': modelKey,
                        'num_pc': num_pc,
                        'source':  hex_key,
                    }
                    kwargs = score_params.copy()
                    pcaScoreResult = h2o.nodes[0].pca_score(timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                    h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2)
                    print "PCAScore completed in", pcaScoreResult['python_elapsed'], "seconds. On dataset: ", csvPathname
                    print "Elapsed time was ", pcaScoreResult['python_%timeout'], "% of the timeout"

                    # Logging to a benchmark file
                    algo = "PCAScore " + " num_pc=" + str(score_params['num_pc'])
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed'])
                    print l
                    h2o.cloudPerfH2O.message(l)
Example #8
    def test_PCA_many_cols_enum_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            "a",
            "b",
            "c",
            "d",
            "e",
            "f",
            "g",
            "h",
            "i",
            "j",
            "k",
            "l",
            "m",
            "n",
            "o",
            "p",
            "q",
            "r",
            "s",
            "t",
            "u",
        ]

        if localhost:
            tryList = [
                (1001, 2, "cA", 300),
                # (10001, 2, 'cA', 300),
                # (10000, 500, 'cH', 300),
                # (10000, 1000, 'cI', 300),
            ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300),
                # (10000, 50, 'cC', 300),
                (10000, 100, "cD", 300),
                (10000, 200, "cE", 300),
                (10000, 300, "cF", 300),
                (10000, 400, "cG", 300),
                (10000, 500, "cH", 300),
                (10000, 1000, "cI", 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE ****************************************
            start = time.time()
            modelKey = "PCAModelKey"

            # Parse ****************************************
            parseResult = h2i.import_parse(
                bucket=None, path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False
            )

            elapsed = time.time() - start
            print "parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # Logging to a benchmark file
            algo = "Parse"
            l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed
            )
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # PCA(tolerance iterate)****************************************
            for tolerance in [i / 10.0 for i in range(11)]:
                params = {"ignored_cols": "C1", "destination_key": modelKey, "tolerance": tolerance, "standardize": 1}
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                PCAResult = {"python_elapsed": 0, "python_%timeout": 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                PCAResult["python_elapsed"] = elapsed
                PCAResult["python_%timeout"] = 1.0 * elapsed / timeoutSecs
                print "PCA completed in", PCAResult["python_elapsed"], "seconds.", "%f pct. of timeout" % (
                    PCAResult["python_%timeout"]
                )

                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self, pcaView)
                h2o_pca.resultsCheckPCA(self, pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult["python_elapsed"]
                )
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = pcaView
                # pull the standard deviations out of the PCA model view
                sdevs = pcaInspect["pca_model"]["sdev"]
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
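
As a small aside, the 21-letter enum alphabet spelled out as translateList in Examples #5 and #8 is just the letters 'a' through 'u', so it could be generated instead of hand-written:

import string

# letters 'a'..'u', the same values as the hand-written translateList above
translateList = list(string.ascii_lowercase[:21])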
Example #9
    def test_PCA_manyfiles(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800)
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=hexKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = hexKey

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            #h2o.beta_features = True
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'ignore': 0,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                #h2o.beta_features = True

                pcaResult = h2o_cmd.runPCA(parseResult=parseResult,
                     timeoutSecs=timeoutSecs, **kwargs)
                print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname
                print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout"
                print "Checking PCA results: "
        
                h2o_pca.simpleCheckPCA(self,pcaResult)
                h2o_pca.resultsCheckPCA(self,pcaResult)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                #h2o.beta_features = True
                pcaInspect = h2o_cmd.runInspect(key=modelKey)
                # pull the model's standard deviations out of the Inspect result
                sdevs = pcaInspect["PCAModel"]["stdDev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["PCAModel"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
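
Note the two result layouts that appear across these examples: the fvec PCAView nests results under 'pca_model' with keys 'sdev' and 'propVar', while the older value-array path in Example #9 reads 'PCAModel' with 'stdDev' and 'propVar' out of Inspect. A hedged sketch of reading either layout; the helper name is hypothetical.

def pca_sdevs_and_propvars(result):
    # accept either the fvec PCAView layout or the older VA Inspect layout
    # seen in Example #9, and return (standard deviations, proportions of variance)
    if 'pca_model' in result:
        model = result['pca_model']
        return model['sdev'], model['propVar']
    model = result['PCAModel']
    return model['stdDev'], model['propVar']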