def test_rapids_ifelse_nested(self):
        """Parse iris, then run each exec object in objList twice (two trials),
        printing scalar results against resultList and recording which
        expressions produced a result key."""
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(2):
            # objList/resultList are module-level globals (defined outside this chunk)
            for execObj, expected in zip(objList, resultList):
                # copy so state accumulated by the exec object in trial 0
                # doesn't leak into trial 1
                freshObj = copy(execObj)
                result = freshObj.do()
                # do some scalar result checking
                if expected is not None:
                    # result is a string now??
                    print "result:", result
                    print "expected:", expected
                    # assert result==expected, "%s %s" (result,expected)

                # rows might be zero!
                print "freshObj:", dump_json(freshObj.execResult)
                if 'key' in freshObj.execResult and freshObj.execResult['key']:
                    keys.append(freshObj.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match ('\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
    def test_xl_ast_assert_Z(self):
        """Exercise h2o_xl DF construction and the <<= assignment operator,
        asserting the exact Rapids AST string emitted for scalar, list, and
        tuple right-hand sides."""
        a = DF('a1')
        checkAst(astForInit(a))
        b = DF('b1')
        checkAst(astForInit(b))
        c = DF('c1')
        checkAst(astForInit(c))

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # this just overwrite the a/b/c with python datatypes
        # (disabled: plain '=' rebinds the python names, so no Rapids assign
        # would actually be generated for checkAst to see)
        if 1==0:
            a = 0
            checkAst("(= !a1 #0)")
            b = 0
            checkAst("(= !b1 #0)")
            c = 0
            checkAst("(= !c1 #0)")

            a = [0]
            checkAst("(= !a1 (c {#0}))")
            b = [0,1]
            checkAst("(= !b1 (c {#0;#1}))")
            c = [0,1,2]
            checkAst("(= !c1 (c {#0;#1;#2}))")

            a = (0,) # make sure it's a tuple with comma
            checkAst("(= !a1 (c {#0}))")
            b = (0,1)
            checkAst("(= !b1 (c {#0;#1}))")
            c = (0,1,2)
            checkAst("(= !c1 (c {#0;#1;#2}))")


        # added to init the keys, to avoid AAIOBE at h2o
        a <<= [0] # comma isn't needed
        checkAst("(= !a1 (c {#0}))")
        b <<= [0,1]
        checkAst("(= !b1 (c {#0;#1}))")
        c <<= [0,1,2]
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # these don't work
        if 1==0:
            c = a[0] + b[1]
            # no .do() needed because of types on rhs? or ?
            c.do()
            checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

            c[0] = a[0] + b[1]
            c.do()
            checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_basic(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(100):
            for execExpr in funsList:
                funs = '[%s]' % execExpr
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=4)
                execExpr2 = '(apply %r1 #2 %anon)' 
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=4)
                # rows might be zero!
                if execResult['num_rows'] or execResult['num_cols']:
                    keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(100):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v2 (+ %v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)


            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s", (k, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #6
0
    def tryThemAll(self, set, rows, enumsOnly=False):
        """Write `rows` under every (eol, token, separator) permutation, parse
        each generated file, and optionally run RF on the parse result.

        set: integer used to make temp filenames unique per caller
        rows: list of row strings to permute and write
        enumsOnly: when True, restrict token substitutions to the enum-safe dict
        """
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            # BUG FIX: branches were inverted — enumsOnly previously selected
            # the full tokenChangeDict instead of tokenChangeDictEnumsOnly
            if enumsOnly:
                tcd = self.tokenChangeDictEnumsOnly
            else:
                tcd = self.tokenChangeDict

            for tokenCase in range(len(tcd)):
                newRows1 = self.changeTokens(rows, tokenCase, tcd)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1, sepCase)
                    # unique filename per (set, eol, token, sep) combination
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname, newRows2, eol)
                    # tell the parser when this token set quotes with single quotes
                    singleQuotes = 1 if "'" in tcd[tokenCase][0] else 0
                    parseResult = h2i.import_parse(path=csvPathname, schema='local', singleQuotes=singleQuotes,
                        noPrint=not h2o_args.verbose, retryDelaySecs=0.1,
                        doSummary=DO_SUMMARY, intermediateResults=DO_INTERMEDIATE_RESULTS)

                    if DO_RF:
                        h2o_cmd.runRF(parseResult=parseResult, trees=1,
                            timeoutSecs=10, retryDelaySecs=0.1, noPrint=True, print_params=True)
                    verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    # one progress dot per parsed permutation
                    sys.stdout.write('.')
                    sys.stdout.flush()
    def test_exec2_reduction(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if getpass.getuser()=='jenkins':
            csvPathname = 'standard/billion_rows.csv.gz'
        else:
            csvPathname = '1B/reals_1B_15f.data'
            csvPathname = '1B/reals_100000x1000_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

        for execExpr in initList:
            result = execExpr.do(timeoutSecs=30)

        for execExpr in exprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()
    def test_rapids_vec_fail1(self):
        """Create progressively longer vectors via Rapids, time vector creation
        and v=v+v addition at each length, then plot length vs elapsed time."""
        start = time.time()
        # xList: vector lengths; eList/fList: elapsed secs for create and add
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assetion
        # NOTE(review): maxx is unused now that the range(maxx) loop is
        # commented out below
        maxx = 29
        # for trial in range(maxx):
        # lengths 1M..100M stepping by 10M
        for trial in range(int(1e6),int(100e6),int(10e6)):
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            # NOTE(review): this expression looks paren-unbalanced (3 opens,
            # 2 closes after the brace) — confirm the Rapids parser accepts it
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            execExpr = '(= !v (+ %v %v))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30)
            elapsed2 = time.time() - start

            if execResult['num_rows']:
                keys.append(execExpr)
            
            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)


        if 1==1:
            xLabel = 'vector length'
            eLabel = 'elapsed (create v)'
            fLabel = 'elapsed (v = v + v)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_basic2(self):
        if 1 == 1:
            bucket = "smalldata"
            csvPathname = "iris/iris_wheader.csv"
        else:
            bucket = "home-0xdiag-datasets"
            csvPathname = "standard/covtype.data"

        hexKey = "r1"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        keys = []
        for trial in range(5):
            for execExpr in funsList:
                funs = "[%s]" % execExpr
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=4)
                execExpr2 = "(= !junk (apply %r1 #2 %anon))"
                execResult, result = h2e.exec_expr(
                    h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15
                )
                # rows might be zero!
                if execResult["num_rows"] or execResult["num_cols"]:
                    keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
        def doAll(case):
            keys = []
            trial = 0
            for execExpr in exprList:
                # 4x4 cases per expression
                colons = [
                    # requires only 1 value on rhs
                    '#0 #0',
                    # '"null" #0',
                    # '#0 "null"',

                    # '"null" "null"',
                ]
                for colon in colons:
                    # what if the destination doesn't exist?. Use unique name for each, to see
                    t = "t%s" % trial
                    cases = [
                        # no colon 
                        '(= !{} {})'.format(t, execExpr),
                        # colon lhs
                        # '(= ([ %%s %s) %s)' % (t, colon, execExpr),
                        # colon rhs
                        # '(= !%s  ([ %s %s))' % (t, execExpr, colon),
                        # colon lhs and rhs
                        '(= ([ %{} {}) ([ {} {}))'.format(t, colon, execExpr, colon),
                    ]

                    for case in cases:
                        # init the data frame first to 0 (1 row, 1 col) 
                        print "\nt:", t, "case:", case
                        # can't init it to empty
                        '(= !%s (c {#0})' % t
                        execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4)

                        # colonize it, to see if it blows up!
                        # since they all are assigns, they all are wrapped by '(= !<lhs> ...)
                        # unwrap the inner and wrap it with a colon then wrap it with the assign
                        # change the lhs to be coloned (row and/or col) and change the rhs to be a colon
                        # so four cases
                        # make sure the lhs assign key exists first
                        execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4)
                        # rows/cols could be zero
                        # if execResult['num_rows'] or execResult['num_cols']:
                        # I think if key is not null, then that means a key got created
                        # oh, but exec deletes ones with leading "_" immediately? those are temp keys
                        # we'll put them in the list and see if we see them
                        if execResult['key']:
                            keys.append(execExpr)
                        trial += 1


                print "\nExpressions that created keys"
                for k in keys:
                    print k
                    if re.match('_', k):
                        raise Exception("%s I didn't expect any keys with leading underscores." +
                            "\nDoesn't spencer delete those so I can't read them?" % k)

                h2o.check_sandbox_for_errors()
Example #11
0
    def test_xl_seq_A(self):
        """Exercise h2o_xl Assign() with scalar, list, tuple, and indexed
        right-hand sides, asserting the Rapids AST string for each."""
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1') # knon_* key
        b = DF('b1')
        c = DF('c1')
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)
        # look at our secret stash in the base class. Should see the DFInit?

        # DF does a kv store init. Key doesn't
        # DF inherits from Key. KeyIndexed inherits from Key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)

        # DF should not be any of the expression node types
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # scalar assigns
        Assign(a, 0)
        checkAst("(= !a1 #0)")
        Assign(b, 0)
        checkAst("(= !b1 #0)")
        Assign(c, 0)
        checkAst("(= !c1 #0)")

        # list assigns become Rapids column constructors
        Assign(a, [0])
        checkAst("(= !a1 (c {#0}))")
        Assign(b, [0,1])
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, [0,1,2])
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # tuples should behave identically to lists
        Assign(a, (0,)) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        Assign(b, (0,1))
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, (0,1,2))
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # indexed reads on the rhs
        Assign(c, a[0] + b[1])
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # indexed write on the lhs
        Assign(c[0], (a[0] + b[1]))
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
    def test_rapids_ddply_with_funs(self):
        if 1==0:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'
        else:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        # get rid of the enum response cole
        execExpr2 = '(= !r2 ([ %r1 "null" {#0;#1;#2;#3}))'
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15)

        keys = []
        for execExpr1 in initList:
            # ddply function can only return one row. Just use expressions above as nose
            # some of the expressions above use %v, but v won't be created as key outside any more with ddply
            funs = "[(def anon {v} " + "{};;(sum %v %TRUE);;;)]".format(execExpr1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5)

            execExpr2 = '(= !a h2o.ddply %r2 {#2;#3} %anon)'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=120)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s", (k, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_basic_with_funs_pick5(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        while initList:
            if len(initList) >= 5:
                pick5 = [initList.pop(0) for i in range(5)]
            else:
                pick5 = initList
                global initList
                initList = []
            pick6 = ['(= !v (c {#1;#4567;(: #9 #90);(: #9 #45);#450})'] + pick5
            execExpr1 = ";;".join(pick6)
            # always do a v assign first, as they may reference %v

            funs = '[(def anon {x}  (%s);;;)]' % execExpr1
            execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5)
            execExpr2 = '(apply %r1 #2 %anon)'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=25)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr1)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s", (k, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #14
0
    def test_build_for_clone(self):
        """Keep the cloud alive for up to 4 hours so another test can clone it
        via h2o-nodes.json, optionally checking sandbox logs every minute."""
        # python gets confused about which 'start' if I used start here
        # (beginning is a module-level timestamp set outside this chunk)
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        # sleep in one-minute increments, optionally scanning logs each wakeup
        while (totalTime<maxTime): # die after 4 hours
            time.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)

            ### h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #15
0
    def test_exec2_runif(self):
        """Parse covtype, build runif-based assigns via AssignObj, run them,
        then summarize the resulting keys."""
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed 
            AssignObj('r0.hex', KeyIndexed('r.hex',col=0) ),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1) ),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)  ),
            AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1) ),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        # summaries on each created key (info extraction currently disabled)
        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        # (disabled comparison block)
        if 1==0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)
        

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
    def test_xl_ast_assert_Y(self):
        """Exercise the <<= operator on DF keys with scalar, list, tuple, and
        indexed right-hand sides, asserting the Rapids AST string for each."""
        a = DF('a1')
        checkAst(astForInit(a))
        b = DF('b1')
        checkAst(astForInit(b))
        c = DF('c1')
        checkAst(astForInit(c))

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # scalar assigns
        a <<= 0
        checkAst("(= !a1 #0)")
        b <<= 0
        checkAst("(= !b1 #0)")
        c <<= 0
        checkAst("(= !c1 #0)")

        # list assigns become Rapids column constructors
        a <<= [0]
        checkAst("(= !a1 (c {#0}))")
        b <<= [0,1]
        checkAst("(= !b1 (c {#0;#1}))")
        c <<= [0,1,2]
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # tuples should behave identically to lists
        a <<= (0,) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        b <<= (0,1)
        checkAst("(= !b1 (c {#0;#1}))")
        c <<= (0,1,2)
        checkAst("(= !c1 (c {#0;#1;#2}))")

        # indexed reads on the rhs
        c <<= a[0] + b[1]
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # indexed write on the lhs
        c[0] <<= a[0] + b[1]
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Example #17
0
    def test_rapids_basic(self):
        bucket = "smalldata"
        csvPathname = "iris/iris_wheader.csv"
        hexKey = "v"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        keys = []
        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
            # rows might be zero!
            if execResult["num_rows"] or execResult["num_cols"]:
                keys.append(execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_basic3(self):
        DO_FAIL = False
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []

        # works for 1 pass..why is execExpr set for 2nd pass? should be new instance?
        # if we reuse the same object in the list, it has state?
        # do we need to copy the object...hmm
        for trial in range(1):
            for execObj in funsList:
                freshObj = copy(execObj)
                result = freshObj.do()
                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0)), do=False)
                else:
                    a = Assign('junk', Fcn('anon', 'r1'), do=False)
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_1000_stmnt(self):
        """Run each (large, ~1000-statement) anon function in funsList for 3
        trials, applying it to the parsed frame and tracking created keys."""
        DO_FAIL = False
        # covtype is the failing big-data variant; iris is the default
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []

        for trial in range(3):
            for execObj in funsList:
                # copy so state the exec object accumulates doesn't leak
                # between trials
                freshObj = copy(execObj)
                print "ast length:", len(str(freshObj))
                result = freshObj.do()

                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0)))
                else:
                    a = Assign('junk', Fcn('anon', 'r1'))
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #20
0
    def test_xl_oobe(self):
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1') # knon_* key
        b = DF('b1')
        c = DF('c1')
        # look at our secret stash in the base class. Should see the DFInit?

        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        Assign(a, range(5))
        Assign(b, range(5))
        Assign(c, range(5))
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # print "Referring to non-existent rows causes a problem (AAIOBE)"
        # not any more..change it to legal case
        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
        assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Example #21
0
    def test_parse_time(self):
        """Round-trip check for time-typed columns: parse a synthetic csv of
        date strings, csv-download the parsed frame, re-parse the download,
        and assert rows/cols/missing values agree between the two parses.

        H2O writes time values back out as internal numbers (see the sample
        output in the trailing comments), so the downloaded file contains
        numeric values, not the original date strings.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = None
        colCount = COLS
        # rowCount = 1000
        rowCount = ROWS
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range (20):
            # NOTE(review): rowData is never used; the same dataset written
            # above is re-parsed every trial — confirm that's intended
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            # src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "A trial #", trial
            # optional. only needed to extract parse_key?
            pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(key=pA.parse_key, csvPathname=csvDownloadPathname)

            # do a little testing of saving the key as a csv
            # remove the original parsed key. source was already removed by h2o
            # (deliberately disabled)
            if 1==0:
                h2o.nodes[0].remove_key(pA.parse_key)

            # interesting. what happens when we do csv download with time data?
            parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', hex_key=hex_key)
            print "B trial #", trial
            pB = h2o_cmd.ParseObj(parseResultB, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pB.numRows
            print pB.numCols
            print pB.parse_key
            iB = h2o_cmd.InspectObj(pB.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            # these checks are redundant now
            self.assertEqual(iA.missingList, iB.missingList,
                "missingValuesList mismatches after re-parse of downloadCsv result")
            self.assertEqual(iA.numCols, iB.numCols,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # so I guess that's okay. So allow for an extra row here.
            self.assertEqual(iA.numRows, iB.numRows,
                "pA.numRows: %s pB.numRows: %s mismatch after re-parse of downloadCsv result" % \
                (iA.numRows, iB.numRows) )
            print "H2O writes the internal format (number) out for time."

            # ==> syn_time.csv <==
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

            # ==> csvDownload.csv <==
            # "0","1","2","3","4","5"
            # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12

            h2o.check_sandbox_for_errors()
Example #22
0
 def tearDown(self):
     # After each test: pull node logs locally, then scan sandbox for errors.
     h2o.nodes[0].log_download()
     h2o.check_sandbox_for_errors()
Example #23
0
    def test_rapids_cbind_vec(self):
        """Create a ~10M-row vector via a Rapids seq expression, then cbind it
        to itself with 1..16 copies (powers of two), timing each cbind and
        plotting elapsed times. Records expressions that produced keys."""

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 10
        # for trial in range(maxx):
        # for trial in range(int(1e6),int(200e6),int(1e6)):
        for trial in [int(10e6)]:

            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            # NOTE(review): this AST opens 3 parens but closes only 2 —
            # presumably the Rapids parser tolerates the missing ')'; confirm
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            # cols = 100
            xList = []
            eList = []
            fList = []
            for trial2 in range(0, 5):
            # for trial2 in range(0, 10):
            # fails. Post size?
            # for trial2 in range(0, 16):
                col = 2 ** trial2
                # assert col < 16384, "h2o can't take col == 16384 or more"

                vString = ' '.join(['%v' for x in range(col)])
                execExpr = '(= !v2 (cbind %s))' % vString

                # FIX! check the colnames. 2 cols get C1 and C10? odd
                # try:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
                elapsed2 = time.time() - start

                if execResult['num_rows']:
                    keys.append(execExpr)

                # except:
                #     elapsed2 = 0
                #     h2p.red_print("ERROR: col = %s failed" % col)

                # NOTE(review): the sum timing below is disabled (1==0), so
                # elapsed1/eList repeat the initial vector-creation time for
                # every plotted point — confirm that's intended
                if 1==0:
                    start = time.time()
                    execExpr = '(sum %v2 %TRUE)'
                    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                    elapsed1 = time.time() - start

                # xList.append(length)
                xList.append(col)
                eList.append(elapsed1)
                fList.append(elapsed2)


        if 1==1:
            xLabel = 'col'
            eLabel = 'elapsed (sum)'
            fLabel = 'elapsed (cbind cols)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #24
0
    def test_build_for_clone(self):
        """Keep the cloud alive for up to 4 hours so another test process can
        clone it via h2o-nodes.json; optionally polls the sandbox logs while
        sleeping (CHECK_WHILE_SLEEPING). Deliberately does NOT shut the cloud
        down at the end."""
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4 * 3600
        totalTime = 0
        incrTime = 60  # poll/print interval in seconds
        h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600,
                         "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(
            h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!")

        h2p.blue_print(
            "Will Check cloud status every %s secs and kill cloud if wrong or no answer"
            % incrTime)
        if CHECK_WHILE_SLEEPING:
            h2p.blue_print(
                "Will also look at redirected stdout/stderr logs in sandbox every %s secs"
                % incrTime)

        h2p.red_print(
            "No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print(
            "So if H2O stack traces, it's up to you to kill me if 4 hours is too long"
        )
        h2p.yellow_print(
            "ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown..."
        )

        while (totalTime < maxTime):  # die after 4 hours
            time.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)

            ### h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(
                    datetime.datetime.now()
                ), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1 == 0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
 def tearDown(self):
     # After each test, scan the sandbox logs for errors.
     h2o.check_sandbox_for_errors()
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        for i in range(100):
            if i == 0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr1,
                                                   resultKey='v1',
                                                   timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr2,
                                                   resultKey='v2',
                                                   timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   funs,
                                                   resultKey=None,
                                                   timeoutSecs=5,
                                                   doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v2 (+ %v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr2,
                                                   resultKey='v2',
                                                   timeoutSecs=15)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1 == 0:
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(
                        inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if not k in storeView['keys']:
                        raise Exception("Expected to find %s in %s",
                                        (k, storeView['keys']))
            else:
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #27
0
    def test_rapids_rbind(self):
        """Create a 100M-row vector via a Rapids seq expression, then rbind
        1/3/5/7 copies of it, timing each rbind and a follow-up sum, and plot
        both elapsed-time series. Records expressions that produced keys."""

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        # for trial in range(int(1e6),int(200e6),int(1e6)):
        ROWS = int(100e6)
        for trial in [ROWS]:

            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            # NOTE(review): this AST opens 3 parens but closes only 2 —
            # presumably the Rapids parser tolerates the missing ')'; confirm
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            xList = []
            eList = []
            fList = []
            # gets out of memory error if we rbind too much
            for trial2 in range(1, 8, 2):
            # for trial2 in range(0, 10):
            # fails. Post size?
            # for trial2 in range(0, 16):
                rows = ROWS * trial2

                vString = ' '.join(['%v' for x in range(trial2)])
                execExpr = '(= !v2 (rbind %s))' % vString

                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
                elapsed2 = time.time() - start

                if execResult['num_rows']:
                    keys.append(execExpr)

                # time a sum over the rbind result for the second plot series
                if 1==1:
                    start = time.time()
                    execExpr = '(sum %v2 %TRUE)'
                    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                    elapsed1 = time.time() - start

                # xList.append(length)
                xList.append(rows)
                eList.append(elapsed1)
                fList.append(elapsed2)


        if 1==1:
            xLabel = 'rows'
            eLabel = 'elapsed (sum)'
            fLabel = 'elapsed (rbind)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_parse_multi_header_single(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
            ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
                print f

            # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
            # frames = h2o.nodes[0].frames()['frames']
            frames = h2o.n0.frames()['frames']
            frames_dict = h2o_util.list_to_dict(frames, 'key/name')

            # print "frames:", dump_json(frames)
            # print "frames_dict:", dump_json(frames_dict)

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception("Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()

            # does h2o-dev take a regex? or do we need to glob
            parseResult = h2i.parse_only(pattern='*'+rowxcol+'*',
                hex_key=hex_key, timeoutSecs=timeoutSecs, check_header="1") # header_from_file=header

            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
            print pA.numRows
            print pA.numCols
            print pA.parse_key

            expectedLabelList = headerData.split(",")
            iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols,
                expectedMissinglist=[], expectedLabelList=expectedLabelList)

            if DO_RF:
                # put in an ignore param, that will fail unless headers were parsed correctly
                if HEADER:
                    kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_columns': "['ID','CAPSULE']"}
                else:
                    kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

                rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            h2o.check_sandbox_for_errors()
    def test_rapids_vec_fail(self):
        """Create real-valued and int-valued vectors of growing length, build
        deeply nested '+' Rapids expressions over them, time both variants,
        and plot elapsed times against vector length."""
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6),int(8e6),int(1e6)):

            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial

            # NOTE(review): this AST opens 3 parens but closes only 2 —
            # presumably the Rapids parser tolerates the missing ')'; confirm
            execExpr = '(= !vreal (c {(: #0 #%s)})' % (length - 1)
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # change it to all 1s? v = v==0
            execExpr = '(= !vint (N %vreal #0))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            # comparing the sum times for int vs real..maybe the other guy isn't real. at least: different compression
            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'


            # recursively expand the <patt> placeholder into nested additions
            execExpr = '(= !v2 (+ %vint <patt>))'
            for j in range(3):
                execExpr = re.sub('<patt>', '(+ %vint <patt>)', execExpr)
            # last one
            execExpr = re.sub('<patt>', '(+ %vint %vint)', execExpr)

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed2 = time.time() - start

            execExpr = '(= !v1 (+ %vreal %vreal))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start

            inspectResult = h2o_cmd.runInspect(key='vreal')
            h2o_cmd.infoFromInspect(inspectResult)

            inspectResult = h2o_cmd.runInspect(key='vint')
            h2o_cmd.infoFromInspect(inspectResult)

            summaryResult = h2o_cmd.runSummary(key='vreal')

            if execResult['num_rows']:
                keys.append(execExpr)

            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)


        if 1==1:
            xLabel = 'vector length'
            # NOTE(review): eList holds the vreal+vreal time and fList the
            # nested vint time, so these two labels look swapped — confirm
            eLabel = 'elapsed (v1 = vint + vint)'
            fLabel = 'elapsed (v2 = vreal + vreal)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #30
0
 def tearDown(self):
     # After each test, scan the sandbox logs for errors.
     h2o.check_sandbox_for_errors()
Example #31
0
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
                assert parse_key == hex_key
                assert numCols == colCount
                assert numRows == rowCount

                inspect = h2o_cmd.runInspect(key=hex_key)
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert len(missingList) == 0

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
        
                        if 1==0:
                            execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                            fpResult = execResult['scalar']
                        else:
                            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300)
                            # print dump_json(h2o.n0.frames(key="h"))

                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="r1"))
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()

                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult!=expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                                (ullResult, expectedUllSum))
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                                (ullResult, expectedUllSum)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
 def tearDown(self):
     # After each test: pull node logs locally, then scan sandbox for errors.
     h2o.nodes[0].log_download()
     h2o.check_sandbox_for_errors()
    def test_xl_ast_assert_X(self):
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')
        checkAst(astForInit(a))
        b = DF('b1')
        checkAst(astForInit(b))
        c = DF('c1')
        checkAst(astForInit(c))
        # look at our secret stash in the base class. Should see the DFInit?
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        # DF does a kv store init. Key doesn't
        # DF inherits from Key. KeyIndexed inherits from Key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)

        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        Assign(a, 2)
        checkAst("(= !a1 #2)")
        Assign(b, 2)
        checkAst("(= !b1 #2)")
        Assign(c, 2)
        checkAst("(= !c1 #2)")

        # - doesn't exist? multiply by -1?
        Assign(c, ~c)
        checkAst("(= !c1 (^ %c1 #1))") # not right if more than 1 col?
        Assign(c, -c)
        checkAst("(= !c1 (_ %c1))")
        Assign(c, abs(c))
        checkAst("(= !c1 (abs %c1))")

        # this needs to be an h2o int? because it expects int return
        # Assign(c, int(c))
        # checkAst("(= !c1 (trunc c1 ))")

        Assign(a, [0])
        checkAst("(= !a1 (c {#0}))")
        Assign(b, [0,1])
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, [0,1,2])
        checkAst("(= !c1 (c {#0;#1;#2}))")

        Assign(a, (0,)) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        Assign(b, (0,1))
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, (0,1,2))
        checkAst("(= !c1 (c {#0;#1;#2}))")

        Assign(c, a[0] + b[1])
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        Assign(c[0], (a[0] + b[1]))
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Example #34
0
    def test_xl_basic(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexDF = 'v'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexDF)


        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1') # knon_* key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        # look at our secret stash in the base class. Should see the DFInit?
        print "Does the lastExecResult stash work?", dump_json(h2o_xl.Xbase.lastExecResult)
        # this should work if str(DF) returns DF.frame
        inspect = h2o_cmd.runInspect(key=a)
        # print "inspect a", dump_json(inspect)

        b = DF('b1')
        assert isinstance(b, DF)
        inspect = h2o_cmd.runInspect(key=b)
        # print "inspect b", dump_json(inspect)

        Assign(a, [0,0,0])
        assert isinstance(a, Key)
        b <<= [0,0,0]
        assert isinstance(b, Key)
        # FIX! how come I have to create c here first for python
        # see here
        # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
        # is it too much to require c to exist first?
        # c = DF()
        # c <<= a + b

        # this will trigger ok?
        c = DF('c1')
        c <<= [0,0,0]
        assert isinstance(c, Key)
        # c[0] <<= a + b
        # Assign(lhs=c[0], rhs=(a + b))
        rhs = a + b
        Assign(c, rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= !c1 (+ %a1 %b1))"
        assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

        rhs = a[0] + b[0]
        Assign(c[0], rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))"
        assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
        assert ast==astExpected, "Actual: %s    Expected: %s" % (ast, astExpected)

        # assert ast = "(= !b1 (is.na (c {#0})))"

        assert isinstance(c, Key), type(c)

        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        # DF inits the frame
        # if you just want an existing Key, say existing=True
        a = DF('a2') # named data frame
        assert isinstance(a, DF)
        b = DF('b2')
        c = DF('c2')
        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        a <<= 3
        b <<= 3
        c <<= 3
        c[0] <<= a[0] + b[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        a = DF('a3') # named data frame
        b = DF('b3')
        c = DF('c3')
        a <<= 4
        b <<= 4
        c <<= 4

        c[0] <<= a[0] - b[0]
        assert isinstance(c, Key)
        c[0] <<= a[0] * b[0]
        assert isinstance(c, Key)

        a = DF('a4') # named data frame
        b = DF('b4')
        c = DF('c4')
        a <<= 5
        b <<= 5
        c <<= 5
        c[0] <<= (a[0] - b[0])
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        c[0] <<= (a[0] & b[0]) | a[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()