Example 1
    def test_big_sum_fail(self):
        node = h2o.nodes[0]
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + '/temp.csv'
        hex_key = 'temp.hex'
        for trial in range(5):
            # what about seed?
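            # create_frame builds a single integer column (cols=1, integer_fraction=1)
            # with 2.5e8 rows; hex_key is reused, so each trial overwrites the frame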
            cfResult = h2o.nodes[0].create_frame(key=hex_key,
                binary_ones_fraction=0.02, binary_fraction=0, randomize=1, 
                missing_fraction=0, integer_fraction=1, real_range=100,
                has_response=0, response_factors=2, factors=100, cols=1, 
                integer_range=100, value=0, categorical_fraction=0, rows=2.5e+08, 
                timeoutSecs=300)

            inspect = h2o_cmd.runInspect(key=hex_key)
            h2o_cmd.infoFromInspect(inspect, hex_key)

            if UNNECESSARY:
                # this just does an R-style head() slice and download; not critical
                h2e.exec_expr(execExpr="%s = %s" % (hex_key, hex_key))
                h2e.exec_expr(execExpr="Last.value.0 = %s[c(1,2,3,4,5,6),]" % hex_key)
                h2e.exec_expr(execExpr="Last.value.0 = Last.value.0")
                node.csv_download(src_key="Last.value.0", csvPathname=csvPathname)
                node.remove_key("Last.value.0")
                # not sure why this happened
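                # view=10000 presumably bumps the StoreView page size (the default
                # returns about 20 keys at a time, per the delete_keys helper below)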
                h2o_cmd.runStoreView(view=10000, offset=0)


            # Fails on this
            h2e.exec_expr(execExpr='Last.value.1 = %s[,1]' % hex_key)

            print "Trial #", trial, "completed"
Example 2
    def test_kmeans_benign(self):
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, checkHeader=1, 
            timeoutSecs=180, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        expected = [
            ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None), 
            ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None), 
            ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None), 
            ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None), 
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)
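        # i.e. each compared center coordinate may deviate from its expected value by up to 1%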

        # loop, to see if we get same centers

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols
        for trial in range(5):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                'score_each_iteration': False,
                'K': 4, 
                'max_iters': 50,
                'normalize': False,
                'seed': kmeansSeed,
                'init': 'PlusPlus',
            }

            model_key = 'benign_k.hex'
            kmeansResult = h2o.n0.build_model(
                algo='kmeans', 
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters, 
                timeoutSecs=10) 

            modelResult = h2o.n0.models(key=model_key)

            # this prints too
            tuplesSorted, iters, mse, names = \
                h2o_kmeans.simpleCheckKMeans(self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
            
            h2o_cmd.runStoreView()

            # zip with * is its own inverse here. It's sorted by centers for easy comparisons
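            # e.g. zip(*[(1, 'a'), (2, 'b')]) == [(1, 2), ('a', 'b')]; applying zip(*...) again restores the pairs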
            ids, mses, rows, clusters = zip(*tuplesSorted)
Example 3
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                f.write(h2o.dump_json(storeViewResult))
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 4
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000
        bucket = "home-0xdiag-datasets"
        importFolderPath = "manyfiles-nflx-gz"
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern
            hex_key = csvFilename + ".hex"

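            # the same pattern is imported five times back to back; presumably a
            # repeated-import check, since each import should just overwrite the
            # existing key registrations (see the "import has to overwrite" note in Example 7)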
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            parseResult = h2i.import_parse(
                bucket=bucket, path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=600
            )
            execExpr = "A.hex=%s" % parseResult["destination_key"]
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example 5
    def test_parse_nflx_loop_hdfs_fvec(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, use_hdfs=True, hdfs_name_node='mr-0x6', hdfs_version='cdh4')

            timeoutSecs = 500
            importFolderPath = "datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example 6
    def test_parse_airline_multi_hdfs(self):
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname, schema='hdfs', 
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example 7
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # import***************************************** 
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
                
            # the import has to overwrite existing keys. no parse
            h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "import", trial, "end ", 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the import"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 8
    def test_parse_airline_multi_hdfs_many(self):

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*'  # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap,
                     random_udp_drop=RANDOM_UDP_DROP,
                     use_hdfs=True,
                     hdfs_name_node=NAME_NODE,
                     hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname,
                                          schema='hdfs',
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"

                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"

                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern,
                                             hex_key=hex_key,
                                             noPoll=True,
                                             delete_on_done=0,
                                             timeoutSecs=timeoutSecs,
                                             retryDelaySecs=10,
                                             pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example 9
    def test_50_nongz_fvec(self):
        h2o.beta_features = True
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        importFolderPath = 'airlines'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)


            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example 10
    def test_import_nflx_parse_loop(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandbox_ignore_errors = True

            timeoutSecs = 500
            importFolderPath = "/datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                # since we delete the key, we have to re-import every iteration, to get it again
                importHdfsResult = h2i.setupImportHdfs(path=importFolderPath)
                hdfsFullList = importHdfsResult['succeeded']
                for k in hdfsFullList:
                    key = k['key']
                    # just print the first file
                    if 'nflx' in key and 'file_1.dat.gz' in key: 
                        # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                        print "example file we'll use:", key

                ### print "hdfsFullList:", h2o.dump_json(hdfsFullList)
                # error if none? 
                self.assertGreater(len(hdfsFullList),8,"Didn't see more than 8 files in hdfs?")

                key2 = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                time.sleep(5)
                print "Loading from hdfs:", importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilePattern, path=importFolderPath,
                    key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print csvFilePattern, 'parse time:', parseKey['response']['time']
                print "parse result:", parseKey['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example 11
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # PARSE****************************************
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA, hex_key=hex_key,
                delete_on_done=DELETE_ON_DONE, 
                # importParentDir=IMPORT_PARENT_DIR,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # convert to binomial
            if DO_EXEC:
                execExpr="A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

                # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

            if DO_DELETE_MYSELF:
                h2o_import.delete_keys_at_all_nodes()

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 12
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"
    
                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
                
                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example 13
    def test_parse_summary_manyfiles_s3_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [("manyfiles-nflx-gz", 800)]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            # change to 50 files
            csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs
            )

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path=csvPathname,
                schema="s3",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=10,
                pollTimeoutSecs=120,
            )
            elapsed = time.time() - start
            print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 14
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(base_port=55930,
                                java_heap_GB=tryHeap,
                                random_udp_drop=RANDOM_UDP_DROP,
                                use_hdfs=True,
                                hdfs_name_node='mr-0x6',
                                hdfs_version='cdh4')
            else:
                h2o_hosts.build_cloud_with_hosts(
                    node_count=1,
                    java_heap_GB=tryHeap,
                    random_udp_drop=RANDOM_UDP_DROP,
                    use_hdfs=True,
                    hdfs_name_node='mr-0x6',
                    hdfs_version='cdh4')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz",

                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='hdfs',
                                               hex_key=hex_key,
                                               timeoutSecs=timeoutSecs,
                                               retryDelaySecs=10,
                                               pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example 15
    def test_parse_summary_airline_s3(self):
        h2o.beta_features = True
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # note: schema='s3' here
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 16
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip", 300),  # 110.9MB
            ("train_set.zip", 600),  # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets',
                                                        path="allstate",
                                                        schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets',
                                           path=csvPathname,
                                           schema='s3',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
Example 17
    def test_parse_summary_airline_s3(self):
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # note: schema='s3' here
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 18
    def test_parse_summary_manyfiles_1_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_1.dat.gz"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs)
            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            # pass numRows, so we know when na cnt means row is all na's
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, 
                numCols=numCols, numRows=numRows)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 19
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):

            if DO_UNCOMPRESSED:
                csvFilename = "a_1.dat"
            else:
                csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # import*****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()

            # the import has to overwrite existing keys. no parse
            h2i.import_only(bucket='home-0xdiag-datasets',
                            path=csvPathname,
                            schema='put',
                            hex_key=hex_key,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=10,
                            pollTimeoutSecs=120,
                            doSummary=False)
            elapsed = time.time() - start
            print "import", trial, "end ", 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the import"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # exec does read lock on all existing keys
            if DO_EXEC:
                # fails
                execExpr = "A.hex=c(0,1)"
                # execExpr="A.hex=0;"
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
                h2o_cmd.runInspect(key='A.hex')

            print "\nTrying StoreView after the exec "
            h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
            # for node in h2o.nodes:
            #    h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
Example 20
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800
             ),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern
            hex_key = csvFilename + ".hex"

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=600)
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example 21
    def test_50_nongz_fvec(self):
        h2o.beta_features = True
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        importFolderPath = 'airlines'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example 22
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 23
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 24
    def test_exec2_covtype_cols(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=30)
        print "\nParse key is:", parseResult['destination_key']

        ### h2b.browseTheCloud()
        start = time.time()
        # passes with suffix, fails without?
        # suffix = ""
        suffix = ".hex"
        for k in range(54):
            # try the funky c(6) thing like  R, instead of just 6
            execExpr = "Result" + str(k) + suffix + " = c.hex[,c(" + str(k+1) + ")]"
            print "execExpr:", execExpr
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(k) + suffix, 
                timeoutSecs=4)
            for node in h2o.nodes:
                storeView = h2o_cmd.runStoreView(node=node, noPrint=True)
                numKeys = len(storeView['keys'])
                # number of keys should = k + 2? (on each node)
                self.assertEqual(k + 2, numKeys, "# of keys: %s on %s doesn't match expected: %s" % \
                    (numKeys, node, k + 2))
                    # (numKeys, node, k+2, h2o.dump_json(storeView)))

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Example n. 25
def delete_keys(node=None, pattern=None, timeoutSecs=120):
    if not node: node = h2o.nodes[0]
    kwargs = {'filter': pattern}
    deletedCnt = 0
    triedKeys = []
    while True:
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, **kwargs)
        # we get 20 at a time with default storeView
        keys = storeViewResult['keys']
        if not keys:
            break

        # look for keys we already sent a remove on. Maybe those are locked.
        # give up on those
        deletedThisTime = 0
        for k in keys:
            if k in triedKeys:
                print "Already tried to delete %s. Must have failed. Not trying again" % k
            # don't delete the DRF __Tree__ keys. deleting the model does that. causes race conditions
            elif '__Tree__' in k:
                print "Not deleting a tree key from DRF: %s" % k
            elif 'DRF_' in k:
                print "Not deleting DRF key..they may be problematic in flight: %s" % k
            elif '_distcp_' in k:
                print "Not deleting _distcp_ key..got a timeout before trying: %s" % k
            else:
                node.remove_key(k['key'], timeoutSecs=timeoutSecs)
                deletedCnt += 1
                deletedThisTime += 1
            triedKeys.append(k)
        # print "Deleted", deletedCnt, "keys at %s:%s" % (node.http_addr, node.port)
        if deletedThisTime==0:
            break
    # this is really the count that we attempted. Some could have failed.
    return deletedCnt
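
A minimal usage sketch for delete_keys (assumes a cloud is already built so h2o.nodes[0] is valid; the pattern is passed straight through as StoreView's 'filter' argument, as in the helper above):

deletedCnt = delete_keys(pattern='covtype')
print "Attempted to delete", deletedCnt, "keys"
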
Example n. 26
def delete_keys(node=None, pattern=None, timeoutSecs=120):
    if not node:
        node = h2o.nodes[0]
    kwargs = {"filter": pattern}
    deletedCnt = 0
    triedKeys = []
    while True:
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, **kwargs)
        # we get 20 at a time with default storeView
        keys = storeViewResult["keys"]
        if not keys:
            break

        # look for keys we already sent a remove on. Maybe those are locked.
        # give up on those
        deletedThisTime = 0
        for k in keys:
            if k in triedKeys:
                print "Already tried to delete %s. Must have failed. Not trying again" % k
            else:
                node.remove_key(k["key"], timeoutSecs=timeoutSecs)
                deletedCnt += 1
                deletedThisTime += 1
            triedKeys.append(k)
        # print "Deleted", deletedCnt, "keys at %s:%s" % (node.http_addr, node.port)
        if deletedThisTime == 0:
            break
    # this is really the count that we attempted. Some could have failed.
    return deletedCnt
Example n. 27
def delete_keys(node=None, pattern=None, timeoutSecs=120):
    if not node: node = h2o.nodes[0]
    kwargs = {'filter': pattern}
    deletedCnt = 0
    triedKeys = []
    while True:
        storeViewResult = h2o_cmd.runStoreView(node,
                                               timeoutSecs=timeoutSecs,
                                               **kwargs)
        # we get 20 at a time with default storeView
        keys = storeViewResult['keys']
        if not keys:
            break

        # look for keys we already sent a remove on. Maybe those are locked.
        # give up on those
        deletedThisTime = 0
        for k in keys:
            if k in triedKeys:
                print "Already tried to delete %s. Must have failed. Not trying again" % k
            else:
                node.remove_key(k['key'], timeoutSecs=timeoutSecs)
                deletedCnt += 1
                deletedThisTime += 1
            triedKeys.append(k)
        # print "Deleted", deletedCnt, "keys at %s:%s" % (node.http_addr, node.port)
        if deletedThisTime == 0:
            break
    # this is really the count that we attempted. Some could have failed.
    return deletedCnt
Example n. 28
    def test_exec2_covtype_cols(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key='c.hex',
                                       timeoutSecs=30)
        print "\nParse key is:", parseResult['destination_key']

        ### h2b.browseTheCloud()
        start = time.time()
        # passes with suffix, fails without?
        # suffix = ""
        suffix = ".hex"
        for k in range(54):
            # try the funky c(6) thing like  R, instead of just 6
            execExpr = "Result" + str(k) + suffix + " = c.hex[,c(" + str(
                k + 1) + ")]"
            print "execExpr:", execExpr
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey="Result" + str(k) + suffix,
                          timeoutSecs=4)
            for node in h2o.nodes:
                storeView = h2o_cmd.runStoreView(node=node, noPrint=True)
                numKeys = len(storeView['keys'])
                # number of keys should = k + 2? (on each node)
                self.assertEqual(k + 2, numKeys, "# of keys: %s on %s doesn't match expected: %s" % \
                    (numKeys, node, k + 2))
                # (numKeys, node, k+2, h2o.dump_json(storeView)))

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data", 'took', time.time(
        ) - start, 'seconds'
Example n. 29
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(100):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v2 (+ %v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)


            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreview:", h2o.dump_json(storeView)
                    if not lhs in storeView['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
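
The lhs-assign detection in the test above is just a regex over the rapids expression string. The same check in isolation, using an expression taken from the test:

import re

execExpr2 = '(= !v2 (+ %v2 #1))'
r = re.search('![a-zA-Z0-9]+', execExpr2)
if r:
    print "Found key lhs assign", r.group(0)[1:]  # strips the leading '!' -> prints v2
else:
    print "No key lhs assign"
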
Example n. 30
    def test_parse_airline_multi_hdfs(self):
        h2o.beta_features = True
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                # why is 55609 already in use?? 
                h2o_hosts.build_cloud_with_hosts(sandbox_ignore_errors=True, force_tcp=True, java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55604, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname, schema='hdfs', 
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example n. 31
    def test_parse_airline_multi_hdfs(self):
        csvFilename = "hex_10"
        csvFilePattern = '*'  # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap,
                     random_udp_drop=RANDOM_UDP_DROP,
                     disable_assertions=DISABLE_ASSERTIONS,
                     use_hdfs=True,
                     hdfs_name_node=NAME_NODE,
                     hdfs_version=VERSION)

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname,
                                               schema='hdfs',
                                               timeoutSecs=timeoutSecs,
                                               retryDelaySecs=10,
                                               pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv',
                                             hex_key=hex_key,
                                             timeoutSecs=timeoutSecs,
                                             retryDelaySecs=10,
                                             pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example n. 32
    def test_parse_manyfiles_1(self):
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            if DO_UNCOMPRESSED:
                csvFilename = "a_1.dat"
            else:
                csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # import***************************************** 
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
                
            # the import has to overwrite existing keys. no parse
            h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "import", trial, "end ", 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the import"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # exec does read lock on all existing keys
            if DO_EXEC:
                # fails
                execExpr="A.hex=c(0,1)"
                # execExpr="A.hex=0;"
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
                h2o_cmd.runInspect(key='A.hex')

            print "\nTrying StoreView after the exec "
            h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
            # for node in h2o.nodes:
            #    h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example n. 33
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                    use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                time.sleep(5)
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example n. 34
    def test_rapids_basic_with_funs(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        for execExpr1 in initList:
            funs = '[(def anon {x}  (%s);;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               funs,
                                               doFuns=True,
                                               resultKey=None,
                                               timeoutSecs=5)
            execExpr2 = '(apply %r1 #2 %anon)'
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr2,
                                               doFuns=False,
                                               resultKey=None,
                                               timeoutSecs=15)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1 == 0:
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                        inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreview:", h2o.dump_json(storeView)
                    if not lhs in storeView['keys']:
                        raise Exception("Expected to find %s in %s" %
                                        (lhs, storeView['keys']))
            else:
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example n. 35
    def test_storeview_import(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example n. 36
    def test_big_sum_fail(self):
        node = h2o.nodes[0]
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + '/temp.csv'
        hex_key = 'temp.hex'
        for trial in range(5):
            # what about seed?
            cfResult = h2o.nodes[0].create_frame(key=hex_key,
                                                 binary_ones_fraction=0.02,
                                                 binary_fraction=0,
                                                 randomize=1,
                                                 missing_fraction=0,
                                                 integer_fraction=1,
                                                 real_range=100,
                                                 has_response=0,
                                                 response_factors=2,
                                                 factors=100,
                                                 cols=1,
                                                 integer_range=100,
                                                 value=0,
                                                 categorical_fraction=0,
                                                 rows=2.5e+08,
                                                 timeoutSecs=300)

            inspect = h2o_cmd.runInspect(key=hex_key)
            h2o_cmd.infoFromInspect(inspect, hex_key)

            if UNNECESSARY:
                # this is just doing a head to R. not critical
                h2e.exec_expr(execExpr="%s = %s" % (hex_key, hex_key))
                h2e.exec_expr(execExpr="Last.value.0 = %s[c(1,2,3,4,5,6),]" %
                              hex_key)
                h2e.exec_expr(execExpr="Last.value.0 = Last.value.0")
                node.csv_download(src_key="Last.value.0",
                                  csvPathname=csvPathname)
                node.remove_key("Last.value.0")
                # not sure why this happened
                h2o_cmd.runStoreView(view=10000, offset=0)

            # Fails on this
            h2e.exec_expr(execExpr='Last.value.1 = %s[,1]' % hex_key)

            print "Trial #", trial, "completed"
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip",   300), # 110.9MB
            ("train_set.zip",  600), # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example n. 38
def delete_keys(node=None, pattern=None, timeoutSecs=30):
    if not node: node = h2o.nodes[0]
    kwargs = {'filter': pattern}
    storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, **kwargs)
    keys = storeViewResult['keys']
    for k in keys:
        node.remove_key(k['key'])
    deletedCnt = len(keys)
    print "Deleted", deletedCnt, "keys at", node
    return deletedCnt
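
Note that this variant issues a single StoreView, so with the default page size it only ever sees (and removes) the first batch of keys; the looping variants elsewhere in this collection drain the store. A condensed sketch of the looping form, reusing only calls and parameters that appear in those variants:

def delete_keys_all_pages(node=None, pattern=None, timeoutSecs=30):
    if not node: node = h2o.nodes[0]
    deletedCnt = 0
    while True:
        # re-query until StoreView comes back empty; we get ~20 keys per page by default
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, filter=pattern)
        keys = storeViewResult['keys']
        if not keys:
            break
        for k in keys:
            node.remove_key(k['key'])
        deletedCnt += len(keys)
    return deletedCnt
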
Example n. 39
    def test_parse_with_cancel(self):
        mustWait = 10
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameList = [
            ("standard", "covtype.data", 54),
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            ("standard", "covtype20x.data", 54),
            ("manyfiles-nflx-gz", "file_[100-109].dat.gz", 378),
            ]

        # just loop on the same file. If remnants exist and are locked, we will blow up? 
        # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            hex_key = csvFilename + ".hex"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
                timeoutSecs=500, noPoll=True, doSummary=False)
            job_key = parseResult['job_key']

            # give it a little time to start
            time.sleep(3)
            h2o.nodes[0].jobs_cancel(key=job_key)

            # now wait until the job cancels, and we're idle
            h2o_jobs.pollWaitJobs(timeoutSecs=30)
            elapsed = time.time() - start
            print "Cancelled parse completed in", elapsed, "seconds."

            h2o.check_sandbox_for_errors()
            # get a list of keys from storeview. 20 is fine..shouldn't be many, since we putfile, not import folder
            # there maybe a lot since we import the whole "standard" folder
            # find the ones that pattern match the csvFilename, and inspect them. Might be none
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
            keys = storeViewResult['keys']
            for k in keys:
                keyName = k['key']
                print "kevin:", keyName
                if csvFilename in keyName:
                    h2o_cmd.runInspect(key=keyName)
                    h2o.check_sandbox_for_errors()

            # This will tell h2o to delete using the key name from the import file, whatever pattern matches to csvFilename
            # we shouldn't have to do this..the import/parse should be able to overwrite without deleting.
            # h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

            # If you cancel a parse, you aren't allowed to reparse the same file or import a directory with that file,
            # or cause the key name that the parse would have used, for 5 seconds after the cancel request gets a json
            # response
            print "Waiting", mustWait, "seconds before next reparse-cancel."
            time.sleep(mustWait)
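
The cancel sequence in the test above, condensed (all calls exactly as used there; assumes a built cloud and the csvPathname/hex_key set up in the loop):

parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
    hex_key=hex_key, timeoutSecs=500, noPoll=True, doSummary=False)
job_key = parseResult['job_key']
time.sleep(3)                          # give the parse a moment to actually start
h2o.nodes[0].jobs_cancel(key=job_key)
h2o_jobs.pollWaitJobs(timeoutSecs=30)  # block until the cancel lands and the cloud is idle
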
Example n. 40
    def test_rapids_ddply_with_funs(self):
        if 1==0:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'
        else:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        # get rid of the enum response column
        execExpr2 = '(= !r2 ([ $r1 "null" {#0;#1;#2;#3}))'
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15)

        keys = []
        for execExpr1 in initList:
            # ddply function can only return one row. Just use expressions above as nose
            # some of the expressions above use $v, but v won't be created as key outside any more with ddply
            funs = '[(def anon {v} %s;;(sum $v $TRUE);;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5)

            execExpr2 = '(h2o.ddply $r2 {#2;#3} $anon)'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=120)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreview:", h2o.dump_json(storeView)
                    if not lhs in storeView['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example n. 41
    def test_rapids_ddply_with_funs(self):
        if 1==0:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'
        else:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        # get rid of the enum response column
        execExpr2 = '(= !r2 ([ %r1 "null" {#0;#1;#2;#3}))'
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15)

        keys = []
        for execExpr1 in initList:
            # ddply function can only return one row. Just use expressions above as nose
            # some of the expressions above use %v, but v won't be created as key outside any more with ddply
            funs = "[(def anon {v} " + "{};;(sum %v %TRUE);;;)]".format(execExpr1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5)

            execExpr2 = '(= !a (h2o.ddply %r2 {#2;#3} %anon))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=120)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreview:", h2o.dump_json(storeView)
                    if not lhs in storeView['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example n. 42
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example n. 43
    def test_parse_nflx_loop_hdfs_fvec(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap,
                     random_udp_drop=RANDOM_UDP_DROP,
                     use_hdfs=True,
                     hdfs_name_node='mr-0x6',
                     hdfs_version='cdh4')

            timeoutSecs = 500
            importFolderPath = "datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz",

                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='hdfs',
                                               hex_key=hex_key,
                                               timeoutSecs=timeoutSecs,
                                               retryDelaySecs=10,
                                               pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example n. 44
    def test_rapids_basic_with_funs_pick5(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # 'global' must be declared before initList is first used in this scope
        global initList
        while initList:
            if len(initList) >= 5:
                pick5 = [initList.pop(0) for i in range(5)]
            else:
                pick5 = initList
                initList = []
            pick6 = ['(= !v (c {#1;#4567;(: #9 #90);(: #9 #45);#450}))'] + pick5
            execExpr1 = ";;".join(pick6)
            # always do a v assign first, as they may reference %v

            funs = '[(def anon {x}  (%s);;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5)
            execExpr2 = '(apply %r1 #2 %anon)'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreview:", h2o.dump_json(storeView)
                    if not lhs in storeView['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example n. 45
def delete_keys(node=None, pattern=None, timeoutSecs=90):
    if not node: node = h2o.nodes[0]
    kwargs = {'filter': pattern}
    deletedCnt = 0
    while True:
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, **kwargs)
        # we get 20 at a time with default storeView
        keys = storeViewResult['keys']
        if not keys:
            break
        for k in keys:
            node.remove_key(k['key'])
        deletedCnt += len(keys)
        # print "Deleted", deletedCnt, "keys at %s:%s" % (node.http_addr, node.port)
    return deletedCnt
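
Unlike the triedKeys variants earlier, this version loops until StoreView returns no keys, so a key whose remove silently fails would keep the loop from ever terminating. A sketch of that guard keyed on the key name (my condensation of the triedKeys idea, not the original helper):

def delete_keys_once_each(node=None, pattern=None, timeoutSecs=90):
    if not node: node = h2o.nodes[0]
    deletedCnt = 0
    triedKeys = set()  # never issue a second remove for the same key name
    while True:
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, filter=pattern)
        untried = [k for k in storeViewResult['keys'] if k['key'] not in triedKeys]
        if not untried:
            break
        for k in untried:
            node.remove_key(k['key'])
            triedKeys.add(k['key'])
        deletedCnt += len(untried)
    return deletedCnt
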
Example n. 46
def delete_keys(node=None, pattern=None, timeoutSecs=120):
    if not node: node = h2o_nodes.nodes[0]

    kwargs = {'filter': pattern}
    deletedCnt = 0
    triedKeys = []
    while True:
        # FIX! h2o is getting a bad store_view NPE stack trace if I grab all the
        # keys at the end of a test, prior to removing. Just grab 20 at a time like h2o
        # used to do for me. Maybe the keys are changing state, and going slower will eliminate the race
        # against prior work (but note that R might see the same problem)
        storeViewResult = h2o_cmd.runStoreView(node,
                                               timeoutSecs=timeoutSecs,
                                               view=20,
                                               **kwargs)
        # we get 20 at a time with default storeView
        keys = storeViewResult['keys']

        if not keys:
            break

        # look for keys we already sent a remove on. Maybe those are locked.
        # give up on those
        deletedThisTime = 0
        for k in keys:
            if k in triedKeys:
                print "Already tried to delete %s. Must have failed. Not trying again" % k
            # don't delete the DRF __Tree__ keys. deleting the model does that. causes race conditions
            elif '__Tree__' in k['key']:
                print "Not deleting a tree key from DRF: %s" % k
            elif 'DRF_' in k['key']:
                print "Not deleting DRF key..they may be problematic in flight: %s" % k
            elif '__RFModel__' in k['key']:
                print "Not deleting __RFModel__ key..seeing NPE's if I try to delete them: %s" % k
            else:
                print "Deleting", k['key'], "at", node
                node.remove_key(k['key'], timeoutSecs=timeoutSecs)
                deletedCnt += 1
                deletedThisTime += 1
            triedKeys.append(k)
        # print "Deleted", deletedCnt, "keys at %s:%s" % (node.http_addr, node.port)
        if deletedThisTime == 0:
            break
    # this is really the count that we attempted. Some could have failed.
    return deletedCnt
Example n. 47
    def test_parse_with_cancel(self):
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameList = [
            ("standard", "covtype.data", 54),
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            ("standard", "covtype20x.data", 54),
            ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            ]

        # just loop on the same file. If remnants exist and are locked, we will blow up? 
        # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            hex_key = csvFilename + ".hex"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
                timeoutSecs=500, noPoll=True, doSummary=False)
            job_key = parseResult['job_key']

            # give it a little time to start
            time.sleep(3)
            h2o.nodes[0].jobs_cancel(key=job_key)

            # now wait until the job cancels, and we're idle
            h2o_jobs.pollWaitJobs(timeoutSecs=30)
            elapsed = time.time() - start
            print "Cancelled parse completed in", elapsed, "seconds."

            h2o.check_sandbox_for_errors()
            # get a list of keys from storeview. 20 is fine..shouldn't be many, since we putfile, not import folder
            # there maybe a lot since we import the whole "standard" folder
            # find the ones that pattern match the csvFilename, and inspect them. Might be none
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
            keys = storeViewResult['keys']
            for k in keys:
                keyName = k['key']
                print "kevin:", keyName
                if csvFilename in keyName:
                    h2o_cmd.runInspect(key=keyName)
                    h2o.check_sandbox_for_errors()
Example n. 48
def count_keys(node=None, pattern=None, timeoutSecs=90):
    if not node: node = h2o.nodes[0]
    kwargs = {'filter': pattern}
    nodeCnt = 0
    offset = 0
    while True:
        # we get 20 at a time with default storeView
        # if we get < 20, we're done
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, offset=offset, view=20, **kwargs)
        keys = storeViewResult['keys']
        if not keys:
            break
        nodeCnt += len(storeViewResult['keys'])
        if len(keys) < 20:
            break
        offset += 20

    print nodeCnt, "keys at %s:%s" % (node.http_addr, node.port)
    return nodeCnt
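
A usage sketch (assumes a built cloud; the pattern is StoreView's substring 'filter', as above):

nodeCnt = count_keys(pattern='.hex')
print nodeCnt, "keys matched '.hex'"
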
Example n. 49
def count_keys(node=None, pattern=None, timeoutSecs=90):
    if not node: node = h2o.nodes[0]
    kwargs = {'filter': pattern}
    nodeCnt = 0
    offset = 0
    while True:
        # we get 20 at a time with default storeView
        # if we get < 20, we're done
        storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=timeoutSecs, offset=offset, view=20, **kwargs)
        keys = storeViewResult['keys']
        if not keys:
            break
        nodeCnt += len(storeViewResult['keys'])
        if len(keys) < 20:
            break
        offset += 20

    print nodeCnt, "keys at %s:%s" % (node.http_addr, node.port)
    return nodeCnt
Example n. 50
    def test_billion_rows(self):
        # just do the import folder once
        timeoutSecs = 1500

        csvFilenameAll = [
            # quick test first
            # "covtype.data",
            # then the real thing
            "billion_rows.csv.gz",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path='standard/' + csvFilename,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            parameters = {
                'response_column': 1,
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }
            model_key = 'B.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          destination_key=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            labelListUsed = labelList
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()

            labelListUsed = labelList
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)
Example n. 51
    def test_RF_mnist_reals(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 trainCsvFilename,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 100
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key': 'mnist_reals_training.csv.hex'
                'features': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       rfView=False,
                                       timeoutSecs=timeoutSecs,
                                       pollTimeoutsecs=60,
                                       retryDelaySecs=2,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            self.assertAlmostEqual(
                classification_error,
                0.03,
                delta=0.5,
                msg="Classification error %s differs too much" %
                classification_error)
            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example 52
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]
        # IMPORT**********************************************
        # H2O deletes the source key. So re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        # the list could be from hdfs/s3 (ec2 remap) or local; they have different list structures
        if 'succeeded' in importFolderResult:
            succeededList = importFolderResult['succeeded']
        elif 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            raise Exception("Can't find 'files' or 'succeeded' in import list")

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 3,
                           "Should see more than 3 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseKey['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"

            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                with open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") as f:
                    json.dump(storeViewResult, f, indent=4, sort_keys=True, default=str)
                lastStoreViewResult = storeViewResult

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
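
The loop above writes one StoreView dump per node. A hedged sketch of comparing those dumps afterwards; the 'keys'/'key' field names are assumptions about the StoreView JSON layout, not verified against the API:

def storeview_key_names(path):
    # load a saved storeview_<n>.txt and return its key names (assumed layout)
    with open(path) as f:
        sv = json.load(f)
    return sorted(k.get('key') for k in sv.get('keys', []))

# all nodes should agree on the global key list, e.g.
# storeview_key_names('storeview_0.txt') == storeview_key_names('storeview_1.txt')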
Example 53
    def test_RF_mnist_both(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult,
         importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                          path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           parsePattern,
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            # print "This is the 'ignore=' we'll use"
            # no longer used; we depend on h2o to get it right.
            ntree = 25
            params = {
                'response': 0,
                'ntrees': ntree,
                # 'data_key': 'mnist_training.csv.hex'
                'mtries': 28,  # fixed because we ignore some cols, which would change the sqrt(cols) calc
                'max_depth': 2147483647,
                'select_stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample_rate': 0.67,
                'oobee': 1,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'destination_key': 'RF_model',
                'nbins': 1024,
                # 'seed': 784834182943470027,
                # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        timeoutSecs=timeoutSecs,
                                        pollTimeoutSecs=180,
                                        retryDelaySecs=2,
                                        **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # RFView (score on test)****************************************
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # was 2.84
            # sometimes get 2.87?
            self.assertAlmostEqual(
                classification_error,
                1.6,
                delta=1.6,
                msg="Classification error %s differs too much" %
                classification_error)

            treeStats = rfView['speedrf_model']['treeStats']
            leaves = {
                'min': treeStats['minLeaves'],
                'mean': treeStats['meanLeaves'],
                'max': treeStats['maxLeaves']
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': treeStats['minDepth'],
                'mean': treeStats['meanDepth'],
                'max': treeStats['maxDepth']
            }
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s %s depth: %s expected: %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            modelKey = rfView['speedrf_model']['_key']
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"

        for d in allDelta:
            print d
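
The leaves/depth loops above compute percent differences by hand; under Python 2, '/' on two ints truncates, so the division has to be forced to float (as fixed above). The same calculation as a tiny helper, a sketch with a name of our choosing:

def pct_diff(actual, expected):
    # percent difference of actual vs expected; float() avoids Python 2
    # integer division silently truncating the ratio to 0
    return (actual - expected) / float(actual) * 100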
Example 54
    def test_DL_mnist(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'laptop/mnist/train.csv.gz'
        csvPathname_test = 'laptop/mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='bigdata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        numCols = iA.numCols
        labelList = iA.labelList
        parseResultV = h2i.import_parse(bucket='bigdata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)

        response = numCols - 1

        # Make a random model id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': None,  # string[] None
            'response_column': labelList[response],  # string None
            'balance_classes': None,  # boolean false
            'max_after_balance_size': None,  # float Infinity
            'keep_cross_validation_splits': None,  # boolean false
            'checkpoint': None,  # Key None
            'overwrite_with_best_model': None,  # boolean true
            'expert_mode': None,  # boolean false
            'autoencoder': None,  # boolean false
            'use_all_factor_levels': None,  # boolean true
            # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
            'activation': 'RectifierWithDropout',  # enum Rectifier 
            'hidden': '[117,131,129]',  # int[] [200, 200]
            'epochs': 2.0,  # double 10.0
            'train_samples_per_iteration': None,  # long -2
            'target_ratio_comm_to_comp': None,  # double 0.02
            'seed': None,  # long 1679194146842485659
            'adaptive_rate': False,  # boolean true
            'rho': None,  # double 0.99
            'epsilon': None,  # double 1.0E-8
            'rate': None,  # double 0.005
            'rate_annealing': None,  # double 1.0E-6
            'rate_decay': None,  # double 1.0
            'momentum_start': 0.5,  # double 0.0
            'momentum_ramp': 100000,  # double 1000000.0
            'momentum_stable': 0.9,  # double 0.0
            'nesterov_accelerated_gradient': None,  # boolean true
            'input_dropout_ratio': 0.2,  # double 0.0
            'hidden_dropout_ratios': None,  # double[] None (this can grid?)
            'l1': 1e-5,  # double 0.0
            'l2': 1e-7,  # double 0.0
            'max_w2': 15,  # float Infinity
            'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
            'initial_weight_scale': None,  # double 1.0
            'loss': 'CrossEntropy',  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
            'score_interval': None,  # double 5.0
            'score_training_samples': None,  # long 10000
            'score_validation_samples': None,  # long 0
            'score_duty_cycle': None,  # double 0.1
            'classification_stop': None,  # double 0.0
            'regression_stop': None,  # double 1.0E-6
            'quiet_mode': None,  # boolean false
            'max_confusion_matrix_size': None,  # int 20
            'max_hit_ratio_k': None,  # int 10
            'class_sampling_factors': None,  # float[] None
            'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
            'diagnostics': None,  # boolean true
            'variable_importances': None,  # boolean false
            'fast_mode': None,  # boolean true
            'ignore_const_cols': None,  # boolean true
            'force_load_balance': None,  # boolean true
            'replicate_training_data': None,  # boolean false
            'single_node_mode': None,  # boolean false
            'shuffle_training_data': None,  # boolean false
            'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
            'sparse': None,  # boolean false
            'col_major': None,  # boolean false
            'average_activation': None,  # double 0.0
            'sparsity_beta': None,  # double 0.0
        }
        expectedErr = 0.057  ## expected validation error for the above model
        relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
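
The final check above is a relative-error tolerance test on the validation error. The same predicate stated standalone (a sketch; the test inlines this logic):

def within_rel_tol(actual, expected, rel_tol):
    # True when actual is within rel_tol (a fraction, 0.20 = 20%) relative
    # error of expected; the equality test short-circuits division by zero
    if actual == expected:
        return True
    return abs((expected - actual) / expected) <= rel_tol

# e.g. within_rel_tol(0.060, 0.057, 0.20) -> True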
Example 55
    def test_rf_mnist_both_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1, "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            print "Not using ignore from this..have to adjust cols?"
            h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            ntree = 2
            params = {
                'response': 'C1',
                # 'ignored_cols_by_name': ignore_x, 
                'ntrees': ntree,
                'mtries': 28, # fixed because we ignore some cols, which would change the sqrt(cols) calc
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': 'RF_model',
                'nbins': 100,
                'importance': 0,
                'balance_classes': 0,
                }

            if rfSeed is None:
                params['seed'] = random.randint(0,sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # print 'rfView:', h2o.dump_json(rfView)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['drf_model']['_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, 
                timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # training and test data are unique, so error won't be low?
            # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error)

            leaves = {
                'min': rfView['drf_model']['treeStats']['minLeaves'],
                'mean': rfView['drf_model']['treeStats']['meanLeaves'],
                'max': rfView['drf_model']['treeStats']['maxLeaves'],
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': rfView['drf_model']['treeStats']['minDepth'],
                'mean': rfView['drf_model']['treeStats']['meanDepth'],
                'max': rfView['drf_model']['treeStats']['maxDepth'],
            }
            depthExpected = {'min': 20, 'mean': 20, 'max': 20}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s depth %s %s %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
    
        for d in allDelta:
            print d
Example 56
    def test_RF_mnist_both(self):
        importFolderPath = "/home/0xdiag/datasets/mnist_repl"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
            ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None,
             '*mnist_training*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 parsePattern,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 100
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key': 'mnist_training.csv.hex'
                'features': 28,  # fixed because we ignore some cols, which would change the sqrt(cols) calc
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                # 'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       rfView=False,
                                       timeoutSecs=timeoutSecs,
                                       pollTimeoutSecs=60,
                                       retryDelaySecs=2,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            print "classification error is expected to be low because we included the test data in with the training!"
            self.assertAlmostEqual(
                classification_error,
                0.028,
                delta=0.01,
                msg="Classification error %s differs too much" %
                classification_error)

            leaves = rfView['trees']['leaves']
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s leaves %s %s %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = rfView['trees']['depth']
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s depth %s %s %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
        for d in allDelta:
            print d
Example 57
    def test_w2v_basic_1(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 500000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                                           checkHeader=1,
                                           delete_on_done=0,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            cA = h2o_test.OutputObj(iA.columns[0], "inspect_column")

            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            for i in range(colCount):
                print cA.type, cA.missing
                self.assertEqual(
                    0, cA.missing,
                    "Column %s Expected %s. missing: %s is incorrect" %
                    (i, 0, cA.missing))
                self.assertEqual(
                    'string', cA.type,
                    "Column %s Expected %s. type: %s is incorrect" %
                    (i, 'string', cA.type))

            if DO_SUMMARY:
                for i in range(colCount):
                    co = h2o_cmd.runSummary(key=parse_key, column=i)
                    print co.label, co.type, co.missing, co.domain, sum(co.bins)
                    self.assertEqual(
                        0, co.missing,
                        "Column %s Expected %s. missing: %s is incorrect" %
                        (i, 0, co.missing))
                    self.assertEqual(
                        'String', co.type,
                        "Column %s Expected %s. type: %s is incorrect" %
                        (i, 'String', co.type))

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'score_each_iteration': None,  # boolean false []
                    'minWordFreq': 5,  # int 5 []
                    'wordModel': 'SkipGram',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'HSM',  # enum # [u'HSM', u'NegSampling']
                    'negSampleCnt': 5,  # int 5 []
                    'vecSize': 100,  # int 100
                    'windowSize': 5,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(algo='word2vec',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=60)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                # not implemented?

                # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
                # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

                h2o_cmd.runStoreView()
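
create_file_with_seps() is defined elsewhere in this suite. A minimal hedged stand-in that writes rowCount rows of colCount random lowercase words (the file name, word length, and comma separator are assumptions), relying on the suite's module-level random/string imports and the SYNDATASETS_DIR global set above:

def create_file_with_seps(rowCount, colCount):
    # hypothetical stand-in: one random 8-letter word per column, comma-separated
    csvPathname = SYNDATASETS_DIR + '/syn_words_%sx%s.csv' % (rowCount, colCount)
    with open(csvPathname, 'w') as f:
        for _ in range(rowCount):
            row = [''.join(random.choice(string.ascii_lowercase) for _ in range(8))
                   for _ in range(colCount)]
            f.write(','.join(row) + '\n')
    return csvPathname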
Example 58
    def test_exec2_xorsum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax,
                 expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(
                    None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum,
                 expectedFpSum) = write_syn_dataset(csvPathname, rowCount,
                                                    colCount, expectedMin,
                                                    expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                    expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                    expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=3000,
                                               retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
                        start = time.time()
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0],
                                                               execExpr,
                                                               resultKey='h',
                                                               timeoutSecs=300)
                        print r, 'exec took', time.time() - start, 'seconds'
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()
                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                            ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                            expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                            expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult != expectedUllSum:
                            raise Exception(
                                "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x"
                                % (ullResult, expectedUllSum))
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                                ullResult, expectedUllSum)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                        ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                    expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                    expectedFpSumAsLongLong, expectedFpSum)
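
The xorsum test leans on h2o_util.doubleToUnsignedLongLong and its inverse. A plausible sketch of those conversions as raw byte reinterpretations via struct; that this is how h2o_util implements them is an assumption:

import struct

def doubleToUnsignedLongLong(d):
    # view the 8 bytes of an IEEE-754 double as an unsigned 64-bit integer
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def unsignedLongLongToDouble(u):
    # the inverse: view an unsigned 64-bit integer's bytes as a double
    return struct.unpack('<d', struct.pack('<Q', u))[0]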
Example 59
    def test_DL_airlines_small(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'train.hex'
        validation_key = 'validation.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)

        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)
        pAV = h2o_cmd.ParseObj(parseResultV)
        iAV = h2o_cmd.InspectObj(pAV.parse_key)

        # Make a random model id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': "['IsDepDelayed_REC']",  # string[] None
            'response_column': 'IsDepDelayed',  # string None
            'loss': 'CrossEntropy'
        }
        expectedErr = 0.32  ## expected validation error for the above model
        relTol = 0.15  ## 15% rel. error tolerance due to Hogwild!

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example 60
    def test_parse_multiprocess_fvec(self):
        h2o.beta_features = True
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True
        OUTSTANDING = min(10, len(h2o.nodes))

        if DO_IRIS:
            global DO_BIGFILE
            DO_BIGFILE = False
            bucket = 'smalldata'
            importFolderPath = "iris"
            csvFilename = "iris2.csv"
            csvFilePattern = "iris2.csv"
            if localhost:
                trialMax = 20
            else:
                trialMax = 100
        elif DO_BIGFILE:
            bucket = 'home-0xdiag-datasets'
            importFolderPath = "standard"
            csvFilename = "covtype20x.data"
            csvFilePattern = "covtype20x.data"
            trialMax = 2 * OUTSTANDING
        else:
            bucket = 'home-0xdiag-datasets'
            importFolderPath = "standard"
            csvFilename = "covtype.data"
            csvFilePattern = "covtype.data"
            trialMax = 40 * OUTSTANDING

        # add one just to make it odd
        # OUTSTANDING = min(10, len(h2o.nodes) + 1)
        # don't have more than one source file per node OUTSTANDING? (think of the node increment rule)
    
        # okay to reuse the src_key name. h2o deletes? use unique hex to make sure it's not reused.
        # might go to unique src keys also ..oops have to, to prevent complaints about the key (lock)
        # can't repeatedly import the folder

        # only if not noPoll. otherwise parse isn't done
        # I guess I have to use 'put' so I can name the src key unique, to get overlap
        # I could tell h2o to not delete, but it's nice to get the keys in a new place?
        # maybe rebalance? FIX! todo

        parseTrial = 0
        summaryTrial = 0
        uploader_resultq = multiprocessing.Queue()
        while parseTrial <= trialMax:
            start = time.time()
            uploaders = []
            if not DO_IRIS:
                assert OUTSTANDING <= 10, "we only have 10 links with unique names to covtype.data"
            for o in range(OUTSTANDING):
                src_key = csvFilename + "_" + str(parseTrial) 
                hex_key = csvFilename + "_" + str(parseTrial) + ".hexxx"
                # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                # hacked hard ln so source keys would have different names? was getting h2o locking issues
                if DO_IRIS:
                    csvPathname = importFolderPath + "/" + csvFilePattern
                else:
                    csvPathname = importFolderPath + "/" + csvFilePattern + "_" + str(o)
                start = time.time()

                # walk the nodes
                # if this rule is matched for exec/summary below, it should find the name okay? (npe with xorsum)
                # summary2 not seeing it?
                np = parseTrial % len(h2o.nodes)
                retryDelaySecs = 5 if DO_BIGFILE else 1
                timeoutSecs = 60 if DO_BIGFILE else 15
                tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                    args=(uploader_resultq, uploadit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
                tmp.start()
                uploaders.append(tmp)
                parseTrial += 1

            # now sync on them
            for uploader in uploaders:
                try:
                    uploader.join()
                    # don't need him any more
                    uploader.terminate()
                    (importPattern, hex_key) = uploader_resultq.get(timeout=10)
                except KeyboardInterrupt:
                    print 'parent received ctrl-c'
                    for uploader in uploaders:
                        uploader.terminate()
                        uploader.join()
            elapsed = time.time() - start
            print "Parse group end at #", parseTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        print "We might have parses that haven't completed. The join just says we can reuse some files (parse still going)"
        if PARSE_NOPOLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=180)

        h2o_cmd.runStoreView()
        # h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=0.25)

        if DO_PARSE_ALSO: # only if we parsed
            print "These all go to node [0]"
            # getting a NPE if I do xorsum (any exec?) ..just do summary for now..doesn't seem to have the issue
            # suspect it's about the multi-node stuff above
            for summaryTrial in range(trialMax):

                # do last to first..to get race condition?
                firstXorUll = None
                firstQuantileUll = None
                hex_key = csvFilename + "_" + str(summaryTrial) + ".hexxx"
                
                if DO_EXEC_QUANT:
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, thresholds)
                    (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    print "%30s" % "median ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                    if firstQuantileUll:
                        self.assertEqual(ullResult, firstQuantileUll)
                    else:
                        firstQuantileUll = ullResult

                if DO_XORSUM:
                    execExpr = "r2=c(1); r2=xorsum(%s[,1], c(%s));" % (hex_key, thresholds)
                    (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    print "%30s" % "xorsum ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                    if firstXorUll:
                        self.assertEqual(ullResult, firstXorUll)
                    else:
                        firstXorUll = ullResult

                if DO_SUMMARY:
                    h2o_cmd.runSummary(key=hex_key)
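
function_no_keyboard_intr and uploadit come from elsewhere in the suite; the sketch below is inferred from the multiprocessing.Process call above and is an assumption, not the suite's actual code. The idea: shield the child from SIGINT so only the parent handles ctrl-c, and push the worker's result onto the shared queue:

import signal

def function_no_keyboard_intr(resultq, fn, *args):
    # hypothetical wrapper: ignore ctrl-c in the child (the parent terminates
    # workers itself), run the real uploader, and report its result
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    resultq.put(fn(*args))

# with uploadit returning (importPattern, hex_key), the parent's
# uploader_resultq.get(timeout=10) above receives exactly that tuple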