    def test_exec2_col_scalar(self):
        """Parse covtype datasets, then run random col/scalar exec2 expressions on them."""
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        maxTrials = 20
        # (filename, per-file exec timeout in seconds)
        csvFilenameAll = [
            ("covtype.data", 15),
            ("covtype20x.data", 60),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ## h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"

        # just always use the same hex_key, so the zeroList is right all the time
        hex_key = 'cA'
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # NOTE(review): SEEDPERFILE is computed but never used in this test
            SEEDPERFILE = random.randint(0, sys.maxint)
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            # zeroList / exprList are presumably module-level expression templates — defined elsewhere
            h2e.exec_zero_list(zeroList)
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
Exemple #2
0
    def test_exec2_dkv(self):
        """Generate a small synthetic 10x8 csv, parse it, and run random exec2 expressions."""
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (filename, destination hex key, exec timeout in seconds)
        csvFilenameAll = [
            ("syn_10x8.csv", 'cA', 15),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            # fresh seed per file so the generated data differs between runs
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random 10x8 csv"
            write_syn_dataset(csvPathname, 10, SEEDPERFILE)
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 6
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=6, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
Exemple #3
0
    def test_factor_with_syn(self):
        """Generate a 1Mx8 synthetic csv, parse it, and run random factor/exec expressions."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # use SEED so the file isn't cached?
        # SEED is presumably a module-level value set elsewhere in this file
        csvFilenameAll = [
            ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # per-file seed drives the synthetic data generator below
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random 1mx8 csv"
            write_syn_dataset(csvPathname, 1000000, SEEDPERFILE)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 6
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
    def test_exec2_na_chop(self):
        """Parse airlines year2013, run init exprs, then random exprs into s.hex; print row counts."""
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        inspect = h2o_cmd.runInspect(key='i.hex')
        print "\nr.hex" \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows1 = inspect['numRows']
        numCols = inspect['numCols']

        for resultKey, execExpr in initList:
            # NOTE(review): the resultKey unpacked from initList is ignored here (resultKey=None)
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, keyX='s.hex', maxTrials=200, timeoutSecs=30, maxCol=numCols-1)

        inspect = h2o_cmd.runInspect(key='s.hex')
        print "\ns.hex" \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows2 = inspect['numRows']

        # before/after row counts are only printed, not asserted
        print numRows1, numRows2


        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
    def test_exec2_operators2(self):
        """Parse iris2, run initList exprs, random exprs, then concatenated expression batches."""
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        bigExecExpr = ""
        expCnt = 0
        for execExpr in exprList:
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
            expCnt += 1
            # reset after 3 concatenated expressions and see what happens
            if expCnt > 2:
                bigExecExpr = ""
                expCnt = 0
                

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
    def test_exec_import_hosts(self):
        """Parse covtype variants (choice depends on localhost) and run random exec expressions."""
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        # 'localhost' is presumably a module-level flag — more trials when running locally
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
                ("covtype.data", "cB", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
                ("covtype20x.data", "cC", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ## h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"
        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            # import each time, because h2o deletes source file after parse
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['Key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
Exemple #7
0
    def test_exec_import_hosts(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2o_import.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['desination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
Exemple #8
0
    def test_exec_operators(self):
        if 1 == 1:
            for execExpr in initList:
                h2e.exec_expr(h2o.nodes[0],
                              execExpr,
                              resultKey="Result.hex",
                              timeoutSecs=4)
        else:
            # init with put_value
            for i in range(0, 5):
                key = "ResultUnparsed" + str(i)
                put = h2o.nodes[0].put_value(i, key=key, repl=None)
                # have to parse the key after you put_value it. put_value should parse the result first!
                key2 = "Result" + str(i)
                parse = h2o.nodes[0].parse(put['key'], key2, timeoutSecs=10)

        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes),
                                exprList,
                                None,
                                maxTrials=200,
                                timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", 'took', time.time(
        ) - start, 'seconds'
Exemple #9
0
    def test_factor_with_syn(self):
        """Generate a 1Mx8 synthetic csv, parse it, and run random factor/exec expressions."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # use SEED so the file isn't cached?
        # SEED is presumably a module-level value set elsewhere in this file
        csvFilenameAll = [
            ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # per-file seed drives the synthetic data generator below
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random 1mx8 csv"
            write_syn_dataset(csvPathname, 1000000, SEEDPERFILE)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 6
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
Exemple #10
0
    def test_slice(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2o_import.setupImportFolder(None, importFolderPath)

        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_import.parseImportFolderFile(None, 
                csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['desination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # try the error case list
            # I suppose we should test the expected error is correct. 
            # Right now just make sure things don't blow up
            h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, 
                maxCol=53, maxRow=400000, maxTrials=5, 
                timeoutSecs=timeoutSecs, ignoreH2oError=True)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
Exemple #11
0
    def test_exec2_operators4(self):
        """Parse iris2, run initList exprs, random exprs, then random concatenated expr batches."""
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=10)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        bigExecExpr = ""
        expCnt = 0

        for t in range(200):
            execExpr = random.choice(exprList)
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=10)
            expCnt += 1
            # reset once more than 2 expressions have accumulated, or immediately
            # when the last expr must stand alone:
            # Also: functions must be solitary
            # Also: ifelse() must be solitary
            # Also: ternary operators must be solitary
            if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr:
                bigExecExpr = ""
                expCnt = 0
                

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Exemple #12
0
    def test_exec2_unary(self):
        """Parse airlines year2013 and run random unary exec2 expressions against it."""
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        for resultKey, execExpr in initList:
            # NOTE(review): the resultKey unpacked from initList is ignored here (resultKey=None)
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=10)
        start = time.time()
        # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
        h2e.exec_expr_list_rand(len(h2o.nodes),
                                exprList,
                                None,
                                maxTrials=200,
                                timeoutSecs=30,
                                allowEmptyResult=True,
                                nanOkay=True)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", 'took', time.time(
        ) - start, 'seconds'
Exemple #13
0
    def test_exec2_col_scalar(self):
        """Parse covtype variants (choice depends on localhost) and run col/scalar exec2 exprs."""
        h2o.beta_features = True
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        # 'localhost' is presumably a module-level flag — more trials when running locally
        if localhost:
            maxTrials = 200
            csvFilenameAll = [("covtype.data", "cA", 15)]
        else:
            maxTrials = 20
            csvFilenameAll = [("covtype.data", "cA", 15), ("covtype20x.data", "cC", 60)]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ## h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"

        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            # NOTE(review): SEEDPERFILE is computed but never used in this test
            SEEDPERFILE = random.randint(0, sys.maxint)
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, timeoutSecs=2000
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            h2e.exec_expr_list_rand(
                lenNodes, exprList, hex_key, maxCol=54, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs
            )
    def test_exec2_operators4(self):
        """Parse iris2, run initList exprs, random exprs, then random concatenated expr batches."""
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        bigExecExpr = ""
        expCnt = 0

        for t in range(200):
            execExpr = random.choice(exprList)
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
            expCnt += 1
            # reset once more than 2 expressions have accumulated, or immediately
            # when the last expr must stand alone:
            # Also: functions must be solitary
            # Also: ifelse() must be solitary
            # Also: ternary operators must be solitary
            if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr:
                bigExecExpr = ""
                expCnt = 0
                

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Exemple #15
0
    def test_slice(self):
        importFolderPath = "standard"
        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        lenNodes = len(h2o.nodes)
        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['desination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # try the error case list
            # I suppose we should test the expected error is correct. 
            # Right now just make sure things don't blow up
            h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, hex_key, 
                maxCol=53, maxRow=400000, maxTrials=5, 
                timeoutSecs=timeoutSecs, ignoreH2oError=True)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
Exemple #16
0
    def test_dkv(self):
        """Generate a small synthetic 10x8 csv, parse it, and run random exec expressions."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (filename, destination hex key, exec timeout in seconds)
        csvFilenameAll = [
            ("syn_10x8.csv", 'cA', 5),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            # fresh seed per file so the generated data differs between runs
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random 10x8 csv"
            write_syn_dataset(csvPathname, 10, SEEDPERFILE)
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 6
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=6, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
    def test_loop_random_exec_covtype(self):
        csvPathname = 'UCI/UCI-large/covtype/covtype.data'
        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=15)
        print "\nParse key is:", parseResult['destination_key']

        # h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex',
            maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15)
        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
    def test_loop_random_exec_covtype(self):
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=15)
        print "\nParse key is:", parseResult['destination_key']

        # h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex',
            maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15)
        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Exemple #19
0
    def test_loop_random_exec_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex',
            maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=5)
        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Exemple #20
0
    def test_exec_import_hosts(self):
        """Import covtype variants (choice depends on localhost), parse, and run random exec exprs."""
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        # 'localhost' is presumably a module-level flag — more trials when running locally
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cB", 15),
                ("covtype20x.data", "cD", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        # NOTE(review): cnum only counts iterations and is otherwise unused
        cnum = 0
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            cnum += 1
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            # we use factor in this test...so timeout has to be bigger!
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    key2,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=maxTrials,
                                    timeoutSecs=(timeoutSecs))
    def test_exec2_operators2(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r0.hex', maxTrials=200, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Exemple #22
0
    def test_exec2_operators(self):
        bucket = "smalldata"
        csvPathname = "iris/iris2.csv"
        hexKey = "i.hex"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, "r0.hex", maxTrials=200, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", "took", time.time() - start, "seconds"
    def test_vector_filter_factor(self):
        """Import/parse covtype variants and run random vector-filter/factor exec expressions."""
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        # 'localhost' is presumably a module-level flag — more trials when running locally
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype.data", "cB", 5),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype20x.data", "cC", 50),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # have to import each time, because h2o deletes the source file after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 53
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    key2,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=maxTrials,
                                    timeoutSecs=timeoutSecs)
    def test_exec2_covtype_rand1(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=15)
        print "\nParse key is:", parseResult['destination_key']

        ### h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', 
            maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Exemple #25
0
    def test_exec2_operators(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
    def test_exec_operators(self):
        h2o.beta_features = True

        for i, execExpr in enumerate(initList):
            if h2o.beta_features: # no default result
                resultKey = "Result" + str(i)
            else:
                resultKey = "Result.hex"
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Exemple #27
0
    def test_exec2_unary(self):
        h2o.beta_features = True
        bucket = "home-0xdiag-datasets"
        csvPathname = "airlines/year2013.csv"
        hexKey = "i.hex"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        start = time.time()
        # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=30)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", "took", time.time() - start, "seconds"
    def test_loop_random_exec_covtype(self):
        csvPathname = "UCI/UCI-large/covtype/covtype.data"
        parseResult = h2i.import_parse(
            bucket="datasets", path=csvPathname, schema="put", hex_key="c.hex", timeoutSecs=15
        )
        print "\nParse key is:", parseResult["destination_key"]

        h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(
            len(h2o.nodes), exprList, "c.hex", maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=10
        )

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data", "took", time.time() - start, "seconds"
Exemple #29
0
    def test_exec_import_hosts(self):
        """Import covtype dataset(s) from a shared folder, parse each, then run
        random exec expressions against the parsed key.

        Trial count and per-dataset timeout scale with dataset size
        (covtype vs covtype20x). Relies on module-level `localhost`,
        `zeroList`, and `exprList`.
        """
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
                ("covtype20x.data", "cC", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # NOTE(review): SEEDPERFILE is computed but never used in this test
            SEEDPERFILE = random.randint(0, sys.maxint)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            # inspect result is unused; the call itself sanity-checks the parse
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    key2,
                                    maxCol=54,
                                    maxRow=400000,
                                    maxTrials=maxTrials,
                                    timeoutSecs=timeoutSecs)
Exemple #30
0
    def test_exec_2(self):
        # exec2 doesn't autoframe? fvec everything
        h2o.beta_features = True
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA.hex", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA.hex", 15),
                ("covtype20x.data", "cA.hex", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ## h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"

        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['desination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    hex_key,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=maxTrials,
                                    timeoutSecs=timeoutSecs)
    def test_exec_operators(self):
        """Run init expressions into Result.hex, then random exec expressions.

        The `else` branch (put_value-based init) is deliberately disabled via
        the constant `if 1==1:` guard and kept for reference.
        """
        if 1==1:
            for execExpr in initList:
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=4)
        else:
            # init with put_value
            for i in range(0,5):
                key = "ResultUnparsed" + str(i)
                put = h2o.nodes[0].put_value(i, key=key, repl=None)
                # have to parse the key after you put_value it. put_value should parse the result first!
                hex_key = "Result" + str(i) 
                parse = h2o.nodes[0].parse(put['key'], hex_key, timeoutSecs=10)

        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Exemple #32
0
    def test_slice(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2o_import.setupImportFolder(None, importFolderPath)

        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2o_import.parseImportFolderFile(None,
                                                        csvFilename,
                                                        importFolderPath,
                                                        key2=key2,
                                                        timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['desination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # try the error case list
            # I suppose we should test the expected error is correct.
            # Right now just make sure things don't blow up
            h2e.exec_expr_list_rand(lenNodes,
                                    exprErrorCaseList,
                                    key2,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=5,
                                    timeoutSecs=timeoutSecs,
                                    ignoreH2oError=True)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    key2,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=100,
                                    timeoutSecs=timeoutSecs)
    def test_exec_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cB", 15),
                ("covtype20x.data", "cD", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            cnum += 1
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            # we use factor in this test...so timeout has to be bigger!
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
Exemple #34
0
    def test_loop_random_exec_covtype(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data',
                                     'c.hex', 15)
        print "\nParse key is:", parseKey['destination_key']

        h2b.browseTheCloud()
        h2e.exec_zero_list(zeroList)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes),
                                exprList,
                                'c.hex',
                                maxCol=54,
                                maxRow=400000,
                                maxTrials=200,
                                timeoutSecs=15)
        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data", 'took', time.time(
        ) - start, 'seconds'
    def test_exec2_operators2(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
    def test_vector_filter_factor(self):
        """Parse covtype dataset(s) and run random exec expressions (vector
        filter/factor). Re-imports before each parse because h2o deletes the
        source file after parsing.
        """
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype.data", "cB", 5),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype20x.data", "cC", 50),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # have to import each time, because h2o deletes the source file after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            # inspect result unused; the call sanity-checks the parsed key
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 53
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
Exemple #37
0
    def test_exec2_operators2(self):
        """Parse iris, run init expressions and the random exec list, then run
        growing concatenations of the expressions (reset every 3)."""
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=resultKey,
                          timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes),
                                exprList,
                                None,
                                maxTrials=200,
                                timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()

        bigExecExpr = ""
        expCnt = 0
        for execExpr in exprList:
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0],
                          bigExecExpr,
                          resultKey=None,
                          timeoutSecs=4)
            expCnt += 1
            # reset the concatenation after every 3 expressions (expCnt > 2)
            # and see what happens
            if expCnt > 2:
                bigExecExpr = ""
                expCnt = 0

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", 'took', time.time(
        ) - start, 'seconds'
Exemple #38
0
    def test_exec2_operators2(self):
        bucket = "smalldata"
        csvPathname = "iris/iris2.csv"
        hexKey = "i.hex"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10, allowEmptyResult=True)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()

        bigExecExpr = ""
        for execExpr in exprBigList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4.0)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", "took", time.time() - start, "seconds"
    def test_exec_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype.data", "cB", 5),
            ]
        else:
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype.data", "cB", 5),
                ("covtype20x.data", "cC", 50),
                ("covtype20x.data", "cD", 50),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
    def test_exec_import_hosts(self):
        """Parse covtype dataset(s) from the standard bucket folder, then run
        random exec expressions (factor ops, hence the larger timeouts).
        """
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cB", 15),
                ("covtype20x.data", "cD", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"
        # cnum just counts datasets processed; not otherwise used
        cnum = 0
        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            cnum += 1
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            # inspect result unused; the call sanity-checks the parsed key
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            # we use factor in this test...so timeout has to be bigger!
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
Exemple #41
0
    def test_exec_import_hosts(self):
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2o_import.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2o_import.parseImportFolderFile(None,
                                                        csvFilename,
                                                        importFolderPath,
                                                        key2=key2,
                                                        timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['desination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes,
                                    exprList,
                                    key2,
                                    maxCol=53,
                                    maxRow=400000,
                                    maxTrials=200,
                                    timeoutSecs=timeoutSecs)
Exemple #42
0
    def test_exec_2(self):
        # exec2 doesn't autoframe? fvec everything
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if h2o.localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA.hex", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA.hex", 15),
                ("covtype20x.data", "cA.hex", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ## h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"

        for (csvFilename, hex_key, timeoutSecs) in csvFilenameList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['desination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes, exprList, hex_key, 
                maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)