Beispiel #1
0
    def test_rapids_mean(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 5, 'cA', 200),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            data_key = hex_key
            data_key2 = hex_key + "_2"
            for trial in range(4):
                result_key = data_key + "_" + str(trial)
                # copy the key
                Assign(data_key2, data_key)
                Assign(result_key,
                       Fcn('mean', KeyIndexed(data_key2, col=0), 0, False))
                trial += 1
    def test_rapids_funs_basic3(self):
        DO_FAIL = False
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []

        # works for 1 pass..why is execExpr set for 2nd pass? should be new instance?
        # if we reuse the same object in the list, it has state?
        # do we need to copy the object...hmm
        for trial in range(1):
            for execObj in funsList:
                freshObj = copy(execObj)
                result = freshObj.do()
                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk',
                               Fcn('anon', KeyIndexed('r1', col=0)),
                               do=False)
                else:
                    a = Assign('junk', Fcn('anon', 'r1'), do=False)
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_basic3(self):
        DO_FAIL = False
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []

        # works for 1 pass..why is execExpr set for 2nd pass? should be new instance?
        # if we reuse the same object in the list, it has state?
        # do we need to copy the object...hmm
        for trial in range(1):
            for execObj in funsList:
                freshObj = copy(execObj)
                result = freshObj.do()
                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0)), do=False)
                else:
                    a = Assign('junk', Fcn('anon', 'r1'), do=False)
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_1000_stmnt(self):
        DO_FAIL = False
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []

        for trial in range(3):
            for execObj in funsList:
                freshObj = copy(execObj)
                print "ast length:", len(str(freshObj))
                result = freshObj.do()

                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0)))
                else:
                    a = Assign('junk', Fcn('anon', 'r1'))
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Beispiel #5
0
    def test_xl_oobe(self):
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')  # knon_* key
        b = DF('b1')
        c = DF('c1')
        # look at our secret stash in the base class. Should see the DFInit?

        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        Assign(a, range(5))
        Assign(b, range(5))
        Assign(c, range(5))
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        # print "Referring to non-existent rows causes a problem (AAIOBE)"
        # not any more..change it to legal case
        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Beispiel #6
0
    def test_GLM_params_rand2(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(hex_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=binomial_key,
                                       columnTypeDict=columnTypeDict,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:, 54], b[:, 54] - 1)
        # make 1 thru 6 go to 1
        Assign(b[:, 54], b[:, 54] != 0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        paramDict = define_params()
        for trial in range(5):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?

            # params is mutable. This is default.
            parameters = {
                'response_column': 'C55',
                'alpha': 0.1,
                # 'lambda': 1e-4,
                'lambda': 0,
            }
            h2o_glm.pickRandGlmParams(paramDict, parameters)

            if 'family' not in parameters or parameters['family'] == 'binomial':
                bHack = binomial_key
            else:
                bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            # fix stupid params
            fixList = [
                'alpha', 'lambda', 'ignored_columns', 'class_sampling_factors'
            ]
            for f in fixList:
                if f in parameters:
                    parameters[f] = "[%s]" % parameters[f]

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=bHack,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self,
                                   model,
                                   parameters,
                                   labelList,
                                   labelListUsed,
                                   allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj(
                    {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1 == 0:
                    print ""
                    for i, c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed
from copy import copy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
keyString = 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz'
keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz'
keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz'
funsList = [
    Def('anon', 'x', 
        [Assign(key, Fcn('var', 'x', None, False, None), do=False) for key in keyString],
        
        [Assign(key, Fcn('sum', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('max', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('min', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('xorsum', KeyIndexed('x',col=0), False), do=False) for key in keyString],

        [Assign(key, Fcn('sd', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('ncol', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('is.factor', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('any.factor', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('length', KeyIndexed('x',col=0)), do=False) for key in keyString],

        [Assign(key, Fcn('sin', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('asin', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('sinh', KeyIndexed('x',col=0)), do=False) for key in keyString],
    def test_GLM_error1(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(hex_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=binomial_key,
                                       columnTypeDict=columnTypeDict,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:, 54], b[:, 54] - 1)
        # make 1 thru 6 go to 1
        Assign(b[:, 54], b[:, 54] != 0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(5):
            parameters = {
                'response_column': 'C55',
                'max_iterations': 3,
                'solver': 'L_BFGS',
                'ignored_columns': '["C1"]',
                'alpha': '[0.1]',
                'max_after_balance_size': 1000.0,
                'class_sampling_factors': '[0.2]',
                # 'use_all_factor_levels': None,
                'lambda': '[0]',
            }

            bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=bHack,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self,
                                   model,
                                   parameters,
                                   labelList,
                                   labelListUsed,
                                   allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj(
                    {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1 == 0:
                    print ""
                    for i, c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_xl_ast_assert_ZZ(self):
        #*****************************************
        a = DF('a1') # inits to -1
        checkAst(astForInit(a))
        # I suppose use of the h2o inspect request is deprecated
        # h2o_cmd.runInspect uses Frames?
        if 1==0:
            inspect = h2o.n0.inspect(key=a) # str(a) becomes 'a1'. so this param should take type Key for key=
            print "a/a1:", dump_json(inspect)

        # let's use runSummary for fun..returns OutputObj for the col
        # will get from column 0, since column not specified
        summaryResult = h2o_cmd.runSummary(key=a)
        co = h2o_cmd.infoFromSummary(summaryResult)
        print "co.label:", co.label
        print "co.data:", co.data

        # how can we get a bunch of data?
        b = DF('b1') # inits to -1
        checkAst(astForInit(b))
        c = DF('c1') # inits to -1
        checkAst(astForInit(c))
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        h2p.yellow_print("Assign compare1")
        Assign(c[0], c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare2")
        Assign(c[0], c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare3")
        Assign(c[0], c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare4")
        Assign(c[0], c[0] != 0)
        checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

        # h2o_xl.debugPrintEnable = True

        #*****************************************
        c = DF('c1')

        h2p.yellow_print("<<= compare1")
        c[0] <<= (c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare2")
        c[0] <<= (c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare3")
        c[0] <<= (c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        #*****************************************
        c = DF('c1') # inits to -1
        h2p.yellow_print("compare1")
        # doesn't assign result to a key?, gets result if scalar, otherwise gets a list or ??? 
        # .result can give us scalar, list, Key, None

        # .result could be a property that triggers a csv download, if we didn't cache the scalar/list result because it was small?
        # i.e. check if .result_cached was None, when .result property is used (property to avoid the need for ()
        result = Expr(c[0] == -1).result
        checkAst("(n ([ %c1 #0 #0) #-1)")
        h2p.yellow_print("Expr result..Desire: python datatype/value if scalar or list,.else Key: %s %s" % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result) # real result?

        if result:
            print "true for if of result", type(result), result
        else:
            print "else for if of result", type(result), result

        #*****************************************
        # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
        result = Assign(None, c[0]==-1).result
        checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
        h2p.yellow_print("Assign result..Desire: python datatype/value if scalar or list,.else Key: %s %s" % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result) # real result?

        if result:
            print "true if of result", result
        else:
            print "false if of result", result
    def test_xl_ast_assert_X(self):
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')
        checkAst(astForInit(a))
        b = DF('b1')
        checkAst(astForInit(b))
        c = DF('c1')
        checkAst(astForInit(c))
        # look at our secret stash in the base class. Should see the DFInit?
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        # DF does a kv store init. Key doesn't
        # DF inherits from Key. KeyIndexed inherits from Key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)

        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        Assign(a, 2)
        checkAst("(= !a1 #2)")
        Assign(b, 2)
        checkAst("(= !b1 #2)")
        Assign(c, 2)
        checkAst("(= !c1 #2)")

        # - doesn't exist? multiply by -1?
        Assign(c, ~c)
        checkAst("(= !c1 (^ %c1 #1))") # not right if more than 1 col?
        Assign(c, -c)
        checkAst("(= !c1 (_ %c1))")
        Assign(c, abs(c))
        checkAst("(= !c1 (abs %c1))")

        # this needs to be an h2o int? because it expects int return
        # Assign(c, int(c))
        # checkAst("(= !c1 (trunc c1 ))")

        Assign(a, [0])
        checkAst("(= !a1 (c {#0}))")
        Assign(b, [0,1])
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, [0,1,2])
        checkAst("(= !c1 (c {#0;#1;#2}))")

        Assign(a, (0,)) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        Assign(b, (0,1))
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, (0,1,2))
        checkAst("(= !c1 (c {#0;#1;#2}))")

        Assign(c, a[0] + b[1])
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        Assign(c[0], (a[0] + b[1]))
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
    def test_rapids_overloaded_opr(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                result_key = data_key + "_" + str(i)
                Assign('s1', Seq(range(5)))

                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                # first try as object, then method
                Assign('s2', Fcn('c', Seq(range(5))))

                # just combine
                Assign('s3', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s3')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                Assign('s2', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                # can't have sequence of sequences?
                # make sure key is created with c()
                f = Fcn(
                    'c',
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s1', f)

                f = Col(
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s2', f)

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 313
                assert numCols == 1

                print "Now trying to do the functions with the alternate overloaded operators"
                data_key = Key(parse_key)
                result_key = Key()
                # what triggers immediate operation at h2o
                # as opposed to an object within a function

                result_key.frame = 'a1'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a2'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a3'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a4'
                result_key <<= data_key[Seq(range(1, 4)), 0:1]
                result_key.frame = 'a5'
                result_key <<= data_key[Seq(range(1, 4)), 0:1]

                result_key.frame = 'a6'
                result_key <<= data_key[[1, 2, 3], 1]

                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
    def test_xl_real(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexDF = 'v'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexDF)

        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')  # knon_* key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        # look at our secret stash in the base class. Should see the DFInit?
        print "Does the lastExecResult stash work?", dump_json(
            h2o_xl.Xbase.lastExecResult)
        # this should work if str(DF) returns DF.frame
        inspect = h2o_cmd.runInspect(key=a)
        # print "inspect a", dump_json(inspect)

        b = DF('b1')
        assert isinstance(b, DF)
        inspect = h2o_cmd.runInspect(key=b)
        # print "inspect b", dump_json(inspect)

        Assign(a, [0.0, 1.0, 2.0])
        assert isinstance(a, Key)
        b <<= [3.1, 4.1, 5.1]
        assert isinstance(b, Key)
        # FIX! how come I have to create c here first for python
        # see here
        # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
        # is it too much to require c to exist first?
        # c = DF()
        # c <<= a + b

        # this will trigger ok?
        c = DF('c1')
        c <<= [6.2, 7.2, 8.2]
        assert isinstance(c, Key)
        # c[0] <<= a + b
        # Assign(lhs=c[0], rhs=(a + b))
        rhs = a + b
        Assign(c, rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= !c1 (+ %a1 %b1))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        rhs = a[0] + b[0]
        Assign(c[0], rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        # assert ast = "(= !b1 (is.na (c {#0})))"

        assert isinstance(c, Key), type(c)

        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        # DF inits the frame
        # if you just want an existing Key, say existing=True
        a = DF('a2')  # named data frame
        assert isinstance(a, DF)
        b = DF('b2')
        c = DF('c2')
        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        a <<= 3
        b <<= 3
        c <<= 3
        c[0] <<= a[0] + b[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        a = DF('a3')  # named data frame
        b = DF('b3')
        c = DF('c3')
        a <<= 4
        b <<= 4
        c <<= 4

        c[0] <<= a[0] - b[0]
        assert isinstance(c, Key)
        c[0] <<= a[0] * b[0]
        assert isinstance(c, Key)

        a = DF('a4')  # named data frame
        b = DF('b4')
        c = DF('c4')
        a <<= 5
        b <<= 5
        c <<= 5
        c[0] <<= (a[0] - b[0])
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        c[0] <<= (a[0] & b[0]) | a[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Beispiel #13
0
    def test_xl_seq_A(self):
        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1') # knon_* key
        b = DF('b1')
        c = DF('c1')
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)
        # look at our secret stash in the base class. Should see the DFInit?

        # DF does a kv store init. Key doesn't
        # DF inherits from Key. KeyIndexed inherits from Key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)

        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        assert isinstance(a, Key)
        assert isinstance(b, Key)
        assert isinstance(c, Key)

        Assign(a, 0)
        checkAst("(= !a1 #0)")
        Assign(b, 0)
        checkAst("(= !b1 #0)")
        Assign(c, 0)
        checkAst("(= !c1 #0)")

        Assign(a, [0])
        checkAst("(= !a1 (c {#0}))")
        Assign(b, [0,1])
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, [0,1,2])
        checkAst("(= !c1 (c {#0;#1;#2}))")

        Assign(a, (0,)) # make sure it's a tuple with comma
        checkAst("(= !a1 (c {#0}))")
        Assign(b, (0,1))
        checkAst("(= !b1 (c {#0;#1}))")
        Assign(c, (0,1,2))
        checkAst("(= !c1 (c {#0;#1;#2}))")

        Assign(c, a[0] + b[1])
        checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        Assign(c[0], (a[0] + b[1]))
        checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Beispiel #14
0
import unittest, random, sys, time
sys.path.extend(['.','..','../..','py'])
import h2o2 as h2o
import h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed
from copy import copy, deepcopy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
funsList = [
    Def('anon', 'x', 
        Assign('a', Fcn('var', 'x', None, False, None), do=False),
        Assign('b', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('e', Fcn('var', 'x', None, False, None), do=False),
        Assign('f', Fcn('var', 'x', None, False, None), do=False),
        Assign('g', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('i', Fcn('var', 'x', None, False, None), do=False),
        Assign('j', Fcn('var', 'x', None, False, None), do=False),
        Assign('k', Fcn('var', 'x', None, False, None), do=False),
        Assign('l', Fcn('var', 'x', None, False, None), do=False),
        Assign('m', Fcn('var', 'x', None, False, None), do=False),
        Assign('n', Fcn('var', 'x', None, False, None), do=False),
        Assign('o', Fcn('var', 'x', None, False, None), do=False),
        Assign('p', Fcn('var', 'x', None, False, None), do=False),
        Assign('q', Fcn('var', 'x', None, False, None), do=False),
    def test_rapids_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                result_key = data_key + "_" + str(i)

                Assign('seq1', Seq(range(5)))
                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                Assign('seq2', Fcn('c', Seq(range(5))))
                inspect = h2o_cmd.runInspect(key='seq1')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

                Assign('seq3', Col(Seq(range(5))))
                inspect = h2o_cmd.runInspect(key='seq2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

                # can't have sequence of sequences?
                # make sure key is created with c()
                Assign(
                    'seq4',
                    Fcn(
                        'c',
                        Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                            range(50, 52))))

                inspect = h2o_cmd.runInspect(key='seq1')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

                Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5))))
                Assign(
                    'seq5',
                    KeyIndexed(data_key,
                               row=Seq(Colon(99, 400), "#2", 1, range(1, 5))))

                # they need to be same size
                # Assign('seq6', Key('seq5') + Key('seq4') + Key('seq3'))

                # doesn't like my cut? complains on FALSE
                # Assign(result_key, Cut(KeyIndexed(data_key, col=0)))
                # Assign(result_key, Cut(KeyIndexed(data_key, col=1), breaks=3))

                Assign(result_key, Fcn('min', KeyIndexed(data_key, col=1),
                                       True))
                Assign(result_key, Fcn('max', KeyIndexed(data_key, col=1),
                                       True))
                Assign(result_key,
                       Fcn('mean', KeyIndexed(data_key, col=1), 0, False))

                Assign(result_key, KeyIndexed(data_key, row='#1'))
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#1', '#100')))
                Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100)))
                # this should fail rapids because of reverse msb/lsb
                # illegal, detected
                # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1')))
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#-2', '#-1')))
                Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1)))
                # illegal, detected
                # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2')))
                # take advantage of number to string conversion
                Assign(result_key,
                       KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
                Assign(result_key,
                       KeyIndexed(data_key, col=Colon(
                           '#1',
                           colCount - 1,
                       )))

                # no assign. Expr() complains when result has no key?
                Assign(result_key,
                       KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
                Assign(result_key,
                       KeyIndexed(data_key, col=Colon(
                           '#1',
                           colCount - 1,
                       )))

                # do some function translation
                Assign(
                    result_key,
                    Fcn('==', 1,
                        KeyIndexed(data_key, col=Colon(
                            '#1',
                            colCount - 1,
                        ))))

                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount),
                                     random.randint(1, iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice

                cutExprList = []

                pKey = Key('p')
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:, i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else:
                        cutExpr = ce

                print "cutExpr:", cutExpr

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # build up the columns
            Assign('b', [1, 2, 3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))

            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                if 1 == 1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."

                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                        inspect)

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
import unittest, random, sys, time
sys.path.extend(['.', '..', '../..', 'py'])
import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed, If, IfElse, Return
from h2o_test import dump_json, verboseprint
from copy import copy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
objList = [
    Assign('e', IfElse(1, 2, IfElse(4, 5, IfElse(7, 8, 9))), do=False),
    Assign('f', IfElse(1, 2, IfElse(4, 5, IfElse(7, 8, 9))), do=False),
    Assign('g', IfElse(0, 2, IfElse(0, 5, IfElse(0, 8, 9))), do=False),
    Def('ms', 'x', [
        IfElse(0, 2, IfElse(0, 5, IfElse(0, 8, 9))),
        Assign('k', IfElse(0, 12, IfElse(0, 15, IfElse(0, 18, 19))), do=False),
    ]),
    Assign('e', Fcn('ms', 2), do=False),
    Def('ms', 'x', [
        If(0, Return(3)),
        IfElse(0, 5, IfElse(0, 8, 9)),
        Assign('k', IfElse(0, 12, IfElse(0, 15, IfElse(0, 18, 19))), do=False),
        If(1, Return(2)),
    ]),
    Assign('e', Fcn('ms', 2), do=False),
]
Beispiel #18
0
    def test_rapids_funs_1op(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(numCols, colCount,
                "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
            self.assertEqual(numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            data_key2 = hex_key + "_2"
            trial = 0
            good = []
            bad = []
            both = h2o_xl.xFcnOp1Set.union(h2o_xl.xFcnOp3Set)
            both = h2o_xl.xFcnOp1Set
            for fun in both:

                a = None
                try:
                    result_key = data_key + "_" + str(trial)
                    # copy the key
                    Assign(data_key2, data_key)

                    # a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True))

                    # a = Assign(result_key, Fcn('sum', KeyIndexed(data_key2, col=0), True))
                    # a = Assign(result_key, Fcn('xorsum', KeyIndexed(data_key2, col=0), True))
                    # a = Assign(result_key, Fcn('sqrt', KeyIndexed(data_key2, col=0)))
                    # a = Assign(result_key, Fcn('ncol', KeyIndexed(data_key2, col=0)))

                    # what's wrong with mean?
                    if fun in ['ncol', 'asin', 'any.factor', 'sin', 'atan', 'tan', 'sign', 'log', 'exp', 'sqrt', 'abs', 'floor', 'ceiling', 'trunc','is.factor', 'is.na', 'any.na', 'nrow', 'tanh', 'length', 'acos', 'cos', 'sinh', 'cosh']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0)))
                        good.append(fun)
                    elif fun in ['sum', 'max', 'min', 'xorsum', 'sd']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True))
                        good.append(fun)
                    elif fun in ['scale']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False))
                        good.append(fun)
                    elif fun in ['round', 'signif']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1))
                        good.append(fun)
                    elif fun in ['seq_len', 'rep_len']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 4))
                        good.append(fun)
                    elif fun in ['seq']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1, 5, 1))
                        good.append(fun)
                    elif fun in ['mean']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 0, False))
                        good.append(fun)
                    elif fun in ['var']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False, False))
                        good.append(fun)
                    elif fun in ['match']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), KeyIndexed(data_key2, col=0), 1, None))
                        good.append(fun)
                    elif fun in ['unique']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, 10, 1))
                        good.append(fun)
                    else:
                        # bad functions kill h2o?
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), None))
                        bad.append(fun)

                        # a = Fcn(fun, KeyIndexed(data_key, col=0), '%FALSE ')
                        # a = Fcn(fun, data_key, '%FALSE')
                        # a = Fcn(fun, data_key)

                    # scalars?
                    if 1==0:
                        inspect = h2o_cmd.runInspect(key=result_key)
                        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                        assert numRows==1000, numRows
                        assert numCols==1, numCols

                        print "\n" + csvPathname, \
                            "    numRows:", "{:,}".format(numRows), \
                            "    numCols:", "{:,}".format(numCols)

                except: 
                    if not a:
                        # print dump_json(a.execResult)
                        bad.append(fun)

                trial += 1

            print "good:", good
            print "bad:", bad
    def test_rapids_row_range(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                result_key = data_key + "_" + str(i)
                # Assign('s1', Seq(range(5)) ).do
                Assign('s1', Seq(range(5)))

                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                # first try as object, then method
                Assign('s2', Fcn('c', Seq(range(5))))
                print dump_json(Xbase.lastExecResult)
                print dump_json(Xbase.lastResult)

                # just combine
                Assign('s3', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s3')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                Assign('s2', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                # can't have sequence of sequences?
                # make sure key is created with c()
                f = Fcn(
                    'c',
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s1', f)

                f = Col(
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s2', f)

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 313
                assert numCols == 1

                print "z1"
                Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5))))
                print "z2"
                Assign(
                    's1',
                    KeyIndexed(data_key,
                               row=Seq(Colon(99, 400), "#2", 1, range(1, 5))))

                print "z3"
                Assign(result_key, KeyIndexed(data_key, row='#1')).do
                print "z4"
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#1', '#100')))
                print "z5"
                Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100)))
                # this should fail rapids because of reverse msb/lsb
                # illegal, detected
                # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1')))
                print "z6"
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#-2', '#-1')))
                print "z7"
                Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1)))
                # illegal, detected
                # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2')))
                # take advantage of number to string conversion
                print "z8"
                Assign(result_key,
                       KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
                print "z9"
                Assign(result_key,
                       KeyIndexed(data_key, col=Colon(
                           '#1',
                           colCount - 1,
                       )))

                # no assign
                print "z10"
                result = KeyIndexed(data_key, row=Colon('#1',
                                                        rowCount - 10)).do()
                print "z11"
                # result = KeyIndexed(data_key, col=Colon('#1', colCount-1,)).do()

                # do some function translation
                print "z12"
                # result = Fcn('==', 1, KeyIndexed(data_key, col=Colon('#1', colCount-1,))).do()

                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
sys.path.extend(['.', '..', '../..', 'py'])
import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed
from copy import copy, deepcopy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
funsList = [
    Def(
        'anon',
        'x',
        Assign('a', Fcn('var', 'x', None, False, None), do=False),
        Assign('b', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('e', Fcn('var', 'x', None, False, None), do=False),
        Assign('f', Fcn('var', 'x', None, False, None), do=False),
        Assign('g', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('i', Fcn('var', 'x', None, False, None), do=False),
        Assign('j', Fcn('var', 'x', None, False, None), do=False),
        Assign('k', Fcn('var', 'x', None, False, None), do=False),
        Assign('l', Fcn('var', 'x', None, False, None), do=False),
        Assign('m', Fcn('var', 'x', None, False, None), do=False),
        Assign('n', Fcn('var', 'x', None, False, None), do=False),
        Assign('o', Fcn('var', 'x', None, False, None), do=False),
        Assign('p', Fcn('var', 'x', None, False, None), do=False),
        Assign('q', Fcn('var', 'x', None, False, None), do=False),