Example #1
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameList = [
            "airlines_88_08_100lines.csv",
        ]

        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
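All of these snippets follow the same basic flow: register an HDFS folder with the cloud, parse a file into a .hex key, then run a model against the result. The sketch below distills that flow under a few assumptions: the h2o-2 python test harness is on the path (h2i conventionally aliases its h2o_import module), a cloud has already been built with HDFS enabled, and the filename and /datasets path are the ones from Example #1.

    import time
    import h2o_cmd
    import h2o_import as h2i

    # register the HDFS folder with the cloud; setupImportHdfs()
    # defaults to the /datasets path used throughout these examples
    h2i.setupImportHdfs()

    # parse one file into a .hex key; the returned dict carries the
    # response time and the destination_key of the parsed frame
    parseKey = h2i.parseImportHdfsFile(csvFilename='airlines_88_08_100lines.csv',
                                       path='/datasets', timeoutSecs=1000)
    print 'parse time:', parseKey['response']['time']
    print 'parse result:', parseKey['destination_key']

    # run a single-tree random forest against the parsed key
    start = time.time()
    RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
    print 'RF took %.2f seconds' % (time.time() - start)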
Example #2
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameList = [
            "airlines_88_08_100lines.csv",
        ]

        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets',
                                               timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,
                                       parseKey=parseKey,
                                       timeoutSecs=2000)
Example #3
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            # these can't RF ..output classes not integer?
            # "bestbuy_test.csv",
            # "bestbuy_train.csv",
            "covtype.data",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            # "prostate_2g.csv",
            # "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            # "poker_c1s1_testing_refresh.csv",
            # "3G_poker_shuffle",
            # "billion_rows.csv.gz",
            # "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
Example #4
    def test_GLM_hdfs_YearPredictionMSD(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            h2i.setupImportHdfs()
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs)

            # different when n_folds validation is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #5
    def test_GLM_hdfs_YearPredictionMSD(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt',
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            h2i.setupImportHdfs()
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs)

            # different when n_folds validation is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #6
 def test_hdfs_multi_bad_csv(self):
     print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"
     # pop open a browser on the cloud
     h2b.browseTheCloud()
     # defaults to /datasets
     h2i.setupImportHdfs()
     parseKey = h2o.nodes[0].parse('*airlines_all*csv', key2='random_csv.hex', 
         exclude=None, header=None, timeoutSecs=600)
     print "*csv* regex to hdfs /datasets", 'parse time:', parseKey['response']['time']
     print "parse result:", parseKey['destination_key']
     sys.stdout.flush() 
Example #7
 def test_hdfs_multi_copies(self):
     print "\nUse the new regex capabilities for selecting hdfs: try *copies* at /datasets"
     print "This should match to a folder with about twenty covtype10x?"
     # pop open a browser on the cloud
     h2b.browseTheCloud()
     # defaults to /datasets
     h2i.setupImportHdfs()
     parseKey = h2i.parseImportHdfsFile(csvFilename='*covtype10x_copies*', key2='copies.hex', 
         exclude=None, header=None, timeoutSecs=600)
     print "*copies* regex to hdfs /datasets", 'parse time:', parseKey['response']['time']
     print "parse result:", parseKey['destination_key']
     sys.stdout.flush() 
Example #8
 def test_hdfs_multi_copies(self):
     print "\nUse the new regex capabilities for selecting hdfs: try *copies* at /datasets"
     print "This should match to a folder with about twenty covtype10x?"
     # pop open a browser on the cloud
     h2b.browseTheCloud()
     # defaults to /datasets
     h2i.setupImportHdfs()
     parseKey = h2o.nodes[0].parse(
         "*covtype10x_copies*", key2="copies.hex", exclude=None, header=None, timeoutSecs=600
     )
     print "*copies* regex to hdfs /datasets", "parse time:", parseKey["response"]["time"]
     print "parse result:", parseKey["destination_key"]
     sys.stdout.flush()
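Examples #7 and #8 do the same thing through two entry points: h2i.parseImportHdfsFile() is the harness-level wrapper, while h2o.nodes[0].parse() issues the parse request directly to the first node; as shown above, both take the same key2, exclude, header, and timeoutSecs arguments for this regex-style parse.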
Example #9
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "allyears2k.csv",
            "billion_rows.csv.gz",
            "covtype.data",
            "covtype.shuffled.data",
            "covtype200x.data",
            "covtype20x.data",
            "kddcup_1999.data.gz",
            "rand_logreg_100000000x70.csv.gz",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisions, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs(
            path='/datasets/standard', 
            schema='maprfs')

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(
                csvFilename=csvFilename, 
                path='/datasets/standard', 
                schema='maprfs', 
                timeoutSecs=1000)

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
Example #10
 def test_hdfs_multi_bad_csv(self):
     print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"
     # pop open a browser on the cloud
     h2b.browseTheCloud()
     # defaults to /datasets
     h2i.setupImportHdfs()
     parseKey = h2o.nodes[0].parse('*airlines_all*csv',
                                   key2='random_csv.hex',
                                   exclude=None,
                                   header=None,
                                   timeoutSecs=600)
     print "*csv* regex to hdfs /datasets", 'parse time:', parseKey[
         'response']['time']
     print "parse result:", parseKey['destination_key']
     sys.stdout.flush()
Example #11
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "allyears2k.csv",
            "billion_rows.csv.gz",
            "covtype.data",
            "covtype.shuffled.data",
            "covtype200x.data",
            "covtype20x.data",
            "kddcup_1999.data.gz",
            "rand_logreg_100000000x70.csv.gz",
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs(path='/datasets/standard', schema='maprfs')

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets/standard',
                                               schema='maprfs',
                                               timeoutSecs=1000)

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,
                                       parseKey=parseKey,
                                       timeoutSecs=2000)
Example #12
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs(
            path='/datasets', 
            schema='maprfs')

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(
                csvFilename=csvFilename, 
                path='/datasets', 
                schema='maprfs', 
                timeoutSecs=1000)

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #13
 def test_hdfs_multi_copies(self):
     print "\nUse the new regex capabilities for selecting hdfs: try *copies* at /datasets"
     print "This should match to a folder with about twenty covtype10x?"
     # pop open a browser on the cloud
     h2b.browseTheCloud()
     # defaults to /datasets
     h2i.setupImportHdfs()
     parseKey = h2o.nodes[0].parse('*covtype10x_copies*',
                                   key2='copies.hex',
                                   exclude=None,
                                   header=None,
                                   timeoutSecs=600)
     print "*copies* regex to hdfs /datasets", 'parse time:', parseKey[
         'response']['time']
     print "parse result:", parseKey['destination_key']
     sys.stdout.flush()
Example #14
    def test_import_nflx_parse_loop(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandbox_ignore_errors = True

            timeoutSecs = 500
            importFolderPath = "/datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                importHdfsResult = h2i.setupImportHdfs(path=importFolderPath)
                hdfsFullList = importHdfsResult['succeeded']
                for k in hdfsFullList:
                    key = k['key']
                    # just print the first file
                    if 'nflx' in key and 'file_1.dat.gz' in key: 
                        # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                        print "example file we'll use:", key

                ### print "hdfsFullList:", h2o.dump_json(hdfsFullList)
                # error if none? 
                self.assertGreater(len(hdfsFullList),8,"Didn't see more than 8 files in hdfs?")

                key2 = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
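                # note: the csvFilepattern regex defined at the top is never used;
                # this narrows the parse to the single file_1.dat.gz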
                # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                time.sleep(5)
                print "Loading from hdfs:", importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilePattern, path=importFolderPath,
                    key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print csvFilePattern, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
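Example #14 is also the only snippet here that shows the cloud being built, via h2o.build_cloud or h2o_hosts.build_cloud_with_hosts with use_hdfs=True and the name node address; the other examples presumably do the equivalent in a setUpClass that these excerpts omit.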
Example #15
    def test_hdfs_multi_bad_csv(self):
        print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"
        # pop open a browser on the cloud
        h2b.browseTheCloud()
        # defaults to /datasets
        h2i.setupImportHdfs()
        # path should default to /datasets

        # One .gz mixed in with non-.gz files seems to cause a stack trace, so don't match all (*airlines*).
        # Or maybe it's just the zero-length gz file? No, that doesn't show up in the list of keys.
        # drwxr-xr-x   - earl   supergroup            0 2013-07-24 17:55 /datasets/airline.gz
        # -rw-r--r--   3 hduser supergroup  12155501626 2013-02-22 17:13 /datasets/airline_116M.csv
        # -rw-r--r--   3 hduser supergroup  11349125129 2013-05-03 15:45 /datasets/airlines_1988_2008.csv
        # -rw-r--r--   3 hduser supergroup  11349125429 2013-05-01 12:52 /datasets/airlines_1988_2008_shuffled.csv
        # -rw-r--r--   3 hduser supergroup         9936 2013-05-01 11:49 /datasets/airlines_88_08_100lines.csv
        # -rw-r--r--   3 hduser supergroup  12155501626 2013-02-23 15:59 /datasets/airlines_all.csv
        # -rw-r--r--   3 hduser supergroup 133710514626 2013-02-23 15:21 /datasets/airlines_all_11x.csv

        parseKey = h2i.parseImportHdfsFile(csvFilename="airline_116M.csv", key2="random_csv.hex", timeoutSecs=600)
        print "*csv* regex to hdfs /datasets", "parse time:", parseKey["response"]["time"]
        print "parse result:", parseKey["destination_key"]
        sys.stdout.flush()
Example #16
    def test_hdfs_multi_bad_csv(self):
        print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"
        # pop open a browser on the cloud
        h2b.browseTheCloud()
        # defaults to /datasets
        h2i.setupImportHdfs()
        # path should default to /datasets

        # One .gz mixed in with non-.gz files seems to cause a stack trace, so don't match all (*airlines*).
        # Or maybe it's just the zero-length gz file? No, that doesn't show up in the list of keys.
        # drwxr-xr-x   - earl   supergroup            0 2013-07-24 17:55 /datasets/airline.gz
        # -rw-r--r--   3 hduser supergroup  12155501626 2013-02-22 17:13 /datasets/airline_116M.csv
        # -rw-r--r--   3 hduser supergroup  11349125129 2013-05-03 15:45 /datasets/airlines_1988_2008.csv
        # -rw-r--r--   3 hduser supergroup  11349125429 2013-05-01 12:52 /datasets/airlines_1988_2008_shuffled.csv
        # -rw-r--r--   3 hduser supergroup         9936 2013-05-01 11:49 /datasets/airlines_88_08_100lines.csv
        # -rw-r--r--   3 hduser supergroup  12155501626 2013-02-23 15:59 /datasets/airlines_all.csv
        # -rw-r--r--   3 hduser supergroup 133710514626 2013-02-23 15:21 /datasets/airlines_all_11x.csv

        parseKey = h2i.parseImportHdfsFile(csvFilename='airline_116M.csv', key2='random_csv.hex', timeoutSecs=600)
        print "*csv* regex to hdfs /datasets", 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']
        sys.stdout.flush() 
Example #17
    def test_KMeans_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "/datasets/kmeans_big"
            csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([
                0.0, -113.00566692375459, -89.99595447985321,
                -455.9970643424373, 4732.0, 49791778.0, 36800.0
            ], 248846122, 1308149283316.2988),
            ([
                0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                25654042.00592703, 28304.0
            ], 276924291, 1800760152555.98),
            ([
                0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                31319.99486705394
            ], 235089554, 375419158808.3253),
            ([
                0.0, 10.0, -72.00113070337981, -171.0198611715457,
                4430.00952228909, 37007399.0, 29894.0
            ], 166180630, 525423632323.6474),
            ([
                0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                22865824.99639042, 5335.0
            ], 167234179, 1845362026223.1094),
            ([
                0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                -47537.998050740985
            ], 195420925, 197941282992.43475),
            ([
                0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289,
                1928.0, 39967190.0, 27202.0
            ], 214401768, 11868360232.658035),
            ([
                0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                30712.99115201907
            ], 258853406, 598863991074.3276),
            ([
                0.0, 21.0, 114.01584574295777, 242.99690338815898,
                1674.0029079209912, 33089556.0, 36415.0
            ], 190979054, 1505088759456.314),
            ([
                0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                -48473733.04122273, 47343.0
            ], 87794427, 1124697008162.3955),
            ([
                0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                16716.003410920028
            ], 78226988, 1151439441529.0215),
            ([
                0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                -14930.007919032574
            ], 167273589, 693036940951.0249),
            ([
                0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                11767.998552236539
            ], 148426180, 35942838893.32379),
            ([
                0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                -23336.998167498707
            ], 157533313, 88431531357.62982),
            ([
                0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008,
                2320.0, 46602185.0, 11212.0
            ], 118361306, 1111537045743.7646),
        ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []
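        # (only the last assignment takes effect; the earlier lines record logging sets that were tried)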

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            if FROM_HDFS:
                importFolderResult = h2i.setupImportHdfs(
                    None, importFolderPath)
            else:
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)

            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            key2 = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseKey = h2i.parseImportHdfsFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)
            else:
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'cols': None,
                'destination_key': 'junk.hex',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           benchmarkLogging=benchmarkLogging,
                                           **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=True,
                                                trial=trial)
Example #18
    def test_B_load_hdfs_and_store_hex_to_hdfs(self):
        print "\nLoad a list of files from 0xdata hdfs, parse, and store the .hex to hdfs"
        print "\nYou can try running as hduser/hduser if fail"

        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "covtype.data",
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            "covtype.169x.data",
            "prostate_2g.csv",
            "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "poker_c1s1_testing_refresh.csv",
            "3G_poker_shuffle",
            "billion_rows.csv.gz",
            "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            print "Storing", parseKey['destination_key'], 'to HDFS'
            ### print "FIX! temporarily disabling since it causes HDFS corruption"
            storeKey = h2o_cmd.runStore2HDFS(key=parseKey['destination_key'], timeoutSecs=1000)


            h2b.browseJsonHistoryAsUrlLastMatch("Parse")

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #19
    def test_B_hdfs_files(self):
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "3G_poker_shuffle",
            "TEST-poker1000.csv",
            # corrupt zip file?
            # "allstate_claim_prediction_train_set.zip",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 1000
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            start = time.time()
            print 'Parsing', csvFilename
            parseKey = h2i.parseImportHdfsFile(
                csvFilename=csvFilename, path='/datasets', timeoutSecs=timeoutSecs, retryDelaySecs=1.0)
            print csvFilename, '\nparse time (python)', time.time() - start, 'seconds'
            print csvFilename, '\nparse time (h2o):', parseKey['response']['time']
            ### print h2o.dump_json(parseKey['response'])

            print "parse result:", parseKey['destination_key']
            # I use this if I want the larger set in my local dir
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            ### print h2o.dump_json(inspect)
            cols = inspect['cols']

            # look for nonzero num_missing_values count in each col
            for i, colDict in enumerate(cols):
                num_missing_values = colDict['num_missing_values']
                if num_missing_values != 0:
                    ### print "%s: col: %d, num_missing_values: %d" % (csvFilename, i, num_missing_values)
                    pass

            ### print h2o.dump_json(cols[0])

            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            row_size = inspect['row_size']
            ptype = inspect['type']
            value_size_bytes = inspect['value_size_bytes']
            response = inspect['response']
            ptime = response['time']

            print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
                   value_size_bytes: %s, response: %s, time: %s" % \
                   (num_cols, num_rows, row_size, ptype, value_size_bytes, response, ptime)

            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

            print "\n" + csvFilename
#             start = time.time()
#             RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)
#             h2b.browseJsonHistoryAsUrlLastMatch("RFView")
#             # wait in case it recomputes it
#             time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #20
    def test_KMeans_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "/datasets/kmeans_big"
            csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []
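        # (only the last assignment takes effect; the earlier lines record logging sets that were tried)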

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            if FROM_HDFS:
                importFolderResult = h2i.setupImportHdfs(None, importFolderPath)
            else:
                importFolderResult = h2i.setupImportFolder(None, importFolderPath)

            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            key2 = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseKey = h2i.parseImportHdfsFile(None, csvFilename, importFolderPath, key2=key2,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'initialization': 'Furthest',
                'epsilon': 1e-6, 
                'cols': None, 
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                }

            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
Example #21
    def test_B_load_hdfs_and_store_hex_to_hdfs(self):
        print "\nLoad a list of files from 0xdata hdfs, parse, and store the .hex to hdfs"
        print "\nYou can try running as hduser/hduser if fail"

        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "covtype.data",
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            "covtype.169x.data",
            "prostate_2g.csv",
            "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "poker_c1s1_testing_refresh.csv",
            "3G_poker_shuffle",
            "billion_rows.csv.gz",
            "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets',
                                               timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            print "Storing", parseKey['destination_key'], 'to HDFS'
            ### print "FIX! temporarily disabling since it causes HDFS corruption"
            storeKey = h2o_cmd.runStore2HDFS(key=parseKey['destination_key'],
                                             timeoutSecs=1000)

            h2b.browseJsonHistoryAsUrlLastMatch("Parse")

            sys.stdout.write('.')
            sys.stdout.flush()
Example #22
    def test_B_hdfs_files(self):
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "3G_poker_shuffle",
            "TEST-poker1000.csv",
            # corrupt zip file?
            # "allstate_claim_prediction_train_set.zip",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 1000
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            start = time.time()
            print 'Parsing', csvFilename
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets',
                                               timeoutSecs=timeoutSecs,
                                               retryDelaySecs=1.0)
            print csvFilename, '\nparse time (python)', time.time() - start, 'seconds'
            print csvFilename, '\nparse time (h2o):', parseKey['response']['time']
            ### print h2o.dump_json(parseKey['response'])

            print "parse result:", parseKey['destination_key']
            # I use this if I want the larger set in my local dir
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            ### print h2o.dump_json(inspect)
            cols = inspect['cols']

            # look for nonzero num_missing_values count in each col
            for i, colDict in enumerate(cols):
                num_missing_values = colDict['num_missing_values']
                if num_missing_values != 0:
                    ### print "%s: col: %d, num_missing_values: %d" % (csvFilename, i, num_missing_values)
                    pass

            ### print h2o.dump_json(cols[0])

            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            row_size = inspect['row_size']
            ptype = inspect['type']
            value_size_bytes = inspect['value_size_bytes']
            response = inspect['response']
            ptime = response['time']

            print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
                   value_size_bytes: %s, response: %s, time: %s"                                                                 % \
                   (num_cols, num_rows, row_size, ptype, value_size_bytes, response, ptime)

            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            print "\n" + csvFilename
Example #23
    def test_import_nflx_parse_loop(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=1,
                                java_heap_GB=tryHeap,
                                use_hdfs=True,
                                hdfs_name_node='192.168.1.176',
                                hdfs_version='cdh3')
            else:
                h2o_hosts.build_cloud_with_hosts(
                    node_count=1,
                    java_heap_GB=tryHeap,
                    use_hdfs=True,
                    hdfs_name_node='192.168.1.176',
                    hdfs_version='cdh3')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandbox_ignore_errors = True

            timeoutSecs = 500
            importFolderPath = "/datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                importHdfsResult = h2i.setupImportHdfs(path=importFolderPath)
                hdfsFullList = importHdfsResult['succeeded']
                for k in hdfsFullList:
                    key = k['key']
                    # just print the first file
                    if 'nflx' in key and 'file_1.dat.gz' in key:
                        # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                        print "example file we'll use:", key

                ### print "hdfsFullList:", h2o.dump_json(hdfsFullList)
                # error if none?
                self.assertGreater(len(hdfsFullList), 8,
                                   "Didn't see more than 8 files in hdfs?")

                key2 = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
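                # note: the csvFilepattern regex defined at the top is never used;
                # this narrows the parse to the single file_1.dat.gz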
                # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz",

                time.sleep(5)
                print "Loading from hdfs:", importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilePattern,
                                                   path=importFolderPath,
                                                   key2=key2,
                                                   timeoutSecs=timeoutSecs,
                                                   retryDelaySecs=10,
                                                   pollTimeoutSecs=60)
                elapsed = time.time() - start

                print csvFilePattern, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #24
    def test_B_hdfs_files(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            "TEST-poker1000.csv",
            "leads.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            # these can't RF ..output classes not integer?
            # "bestbuy_test.csv",
            # "bestbuy_train.csv",
            "covtype.data",
            "covtype.4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            # "prostate_2g.csv",
            # "prostate_long.csv.gz",
            "prostate_long_1G.csv",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            # "poker_c1s1_testing_refresh.csv",
            # "3G_poker_shuffle",
            # "billion_rows.csv.gz",
            # "poker-hand.1244M.shuffled311M.full.txt",
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        timeoutSecs = 200
        # save the first, for all comparisons, to avoid slow drift with each iteration
        firstglm = {}
        h2i.setupImportHdfs()
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename,
                                               path='/datasets',
                                               timeoutSecs=1000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1,
                                       parseKey=parseKey,
                                       timeoutSecs=2000)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush()