def tryThemAll(self, set, rows, enumsOnly=False):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            if enumsOnly:
                tcd = self.tokenChangeDictEnumsOnly
            else:
                tcd = self.tokenChangeDict

            for tokenCase in range(len(tcd)):
                newRows1 = self.changeTokens(rows, tokenCase, tcd)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    if "'" in self.tokenChangeDict[tokenCase]:
                        single_quotes = 1
                    else:
                        single_quotes = 0
                    parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes,
                        noPrint=not h2o.verbose)

                    h2o_cmd.runRF(parseResult=parseResult, trees=1,
                        timeoutSecs=10, retryDelaySecs=0.1, noPrint=True, print_params=True)
                    h2o.verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
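The eol/token/separator dictionaries and the changeTokens/changeSep/writeRows helpers are defined elsewhere in the test class. A minimal sketch of the shapes this loop assumes; the names exist in the original, but the values and bodies here are illustrative guesses:

# Sketch only: dicts keyed 0..n-1 so range(len(...)) indexing works.
eolDict = {0: "\n", 1: "\r\n", 2: "\r"}
tokenChangeDict = {0: ["0", "0.0"], 1: ["'", '"']}
sepChangeDict = {0: ",", 1: " ", 2: "\t"}

def changeTokens(rows, tokenCase, tcd):
    # swap the first token alternative for the second in every row
    (old, new) = tcd[tokenCase]
    return [r.replace(old, new) for r in rows]

def changeSep(rows, sepCase):
    return [r.replace(",", sepChangeDict[sepCase]) for r in rows]

def writeRows(csvPathname, rows, eol):
    with open(csvPathname, 'w') as f:
        for r in rows:
            f.write(r + eol)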
Example #2
    def test_C_hhp_107_01(self):
        csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
        print "\n" + csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # build up the parameter string in X
        y = "106"
        x = ""

        # go right to the big X and iterate on that case
        ### for trial in range(2):
        for trial in range(2):
            print "\nTrial #", trial, "start"
            print "\nx:", x
            print "y:", y

            start = time.time()
            kwargs = {'y': y}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs)
            h2o.check_sandbox_for_errors()
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            print "\nTrial #", trial
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
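exprList comes from module scope and isn't shown. The regex above only matches Rapids assignments of the form (= !key ...), so the entries presumably look something like these (illustrative values, not the originals):

# Hypothetical Rapids expressions; %p refers to the parsed frame keyed 'p'.
exprList = [
    '(= !a (c {#1;#2;#3}))',    # small literal vector
    '(= !b ([ %p "null" #0))',  # first column of the parsed frame
    '(= !c (+ %a %a))',         # arithmetic on a key created above
]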
Example #4
    def test_sort_of_prostate_with_row_schmoo(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        rowData = "1,0,65,1,2,1,1.4,0,6"

        write_syn_dataset(csvPathname,      99860, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        print "Updating the key and key2 names for each trial"
        for trial in range (200):
            append_syn_dataset(csvPathname, rowData)
            ### start = time.time()
            # this was useful to cause failures early on. Not needed eventually
            ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
            ### print "Trial #", trial, "parse end on ", "prostate.csv" , 'took', time.time() - start, 'seconds'

            start = time.time()
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
            print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

            h2o_cmd.runInspect(key=key2)
            # only used this for debug to look at parse (red last row) on failure
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
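write_syn_dataset and append_syn_dataset are module-level helpers; each test module defines its own variant, and this test's append writes one extra data row per call. A minimal sketch under that assumption:

# Sketch of the dataset helpers this test assumes.
def write_syn_dataset(csvPathname, rowCount, headerData, rowData):
    dsf = open(csvPathname, 'w+')
    dsf.write(headerData + "\n")
    for i in range(rowCount):
        dsf.write(rowData + "\n")
    dsf.close()

def append_syn_dataset(csvPathname, rowData):
    with open(csvPathname, 'a') as dsf:
        dsf.write(rowData + "\n")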
    def test_rapids_ifelse_nested(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(2):
            for execObj, expected in zip(objList, resultList):
                freshObj = copy(execObj)
                result = freshObj.do()
                # do some scalar result checking
                if expected is not None:
                    # result is a string now??
                    print "result:", result
                    print "expected:", expected
                    assert float(result) == expected, "%s %s" % (result, expected)

                # rows might be zero!
                print "freshObj:", dump_json(freshObj.execResult)
                if 'key' in freshObj.execResult and freshObj.execResult['key']:
                    keys.append(freshObj.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
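objList and resultList aren't shown. The loop treats each objList entry as an executable expression object paired with an expected scalar (or None). A hypothetical stand-in that satisfies the interface used above (copy(), do(), execResult, execExpr); the real objects come from the test harness:

# Hypothetical expression object; the real one submits Rapids via the harness.
class ExecObj(object):
    def __init__(self, execExpr):
        self.execExpr = execExpr
        self.execResult = {}

    def do(self):
        # submit the expression and cache the response json
        execResult, result = h2e.exec_expr(h2o.nodes[0], self.execExpr,
            resultKey=None, timeoutSecs=15)
        self.execResult = execResult
        return result

objList = [ExecObj('(ifelse #1 #2 #3)')]  # illustrative
resultList = [2.0]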
def exec_list(exprList, lenNodes, csvFilename, key2):
        h2e.exec_zero_list(zeroList)
        # start with trial = 1 because trial-1 is used to point to Result0 which must be initted
        trial = 1
        while (trial < 100):
            for exprTemplate in exprList:
                # do each expression at a random node, to facilitate key movement
                nodeX = random.randint(0,lenNodes-1)
                colX = random.randint(1,54)
                # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
                row = str(random.randint(1,400000))

                execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2)
                execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, 
                    resultKey="Result"+str(trial)+".hex", timeoutSecs=60)

                eri0 = execResultInspect[0]
                eri1 = execResultInspect[1]
                columns = eri0.pop('cols')
                columnsDict = columns[0]
                print "\nexecResult columns[0]:", h2o.dump_json(columnsDict)
                print "\nexecResult [0]:", h2o.dump_json(eri0)
                print "\nexecResult [1] :", h2o.dump_json(eri1)
                
                colMin = columnsDict["min"]
                h2o.verboseprint("min: ", colMin, "trial:", trial)
                ### self.assertEqual(float(colMin), float(trial), "what can we check here")

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                # slows things down to check every iteration, but good for isolation
                h2o.check_sandbox_for_errors()
                print "Trial #", trial, "completed\n"
                trial += 1
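fill_in_expr_template substitutes the random column, row, trial, and source key into a template string. A sketch of the shape assumed here; the placeholder spellings are illustrative, not the harness's actual ones:

# Hypothetical template; <col1>/<row>/<n>/<keyX> mark the substitution points.
exprTemplate = 'Result<n>.hex = min(<keyX>[<row>,<col1>])'

def fill_in_expr_template(exprTemplate, colX, trial, row, key2):
    e = exprTemplate.replace('<keyX>', key2)
    e = e.replace('<col1>', str(colX))
    e = e.replace('<row>', str(row))
    e = e.replace('<n>', str(trial))
    return e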
    def test_GLM_params_rand2(self):
        # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k")

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #8
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, 
    csvSrcOutputPathname, csvPredictPathname, 
    skipSrcOutputHeader, skipPredictHeader,
    translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    h2o.check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" % \
            (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o)!=str(p):
            if wrong==10:
                print "Not printing any more mismatches\n"
            elif wrong<10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s"  % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong)/len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
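compare_csv_at_one_col isn't shown. From the calls above, it reads one column of a csv, optionally skipping a header row and mapping values through the translate dict, and returns (rowCount, valuesList). A minimal sketch under those assumptions:

import csv

# Sketch of compare_csv_at_one_col as used by predict_and_compare_csvs.
def compare_csv_at_one_col(csvPathname, msg, colIndex=0, translate=None, skipHeader=0):
    output = []
    rowNum = 0
    with open(csvPathname, 'rb') as f:
        for rowNum, row in enumerate(csv.reader(f), start=1):
            if rowNum <= skipHeader:
                continue
            value = row[colIndex]
            if translate:
                value = translate.get(value, value)  # hypothetical mapping
            output.append(value)
    print msg, "read", rowNum, "rows from", csvPathname
    return (rowNum, output)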
Example #9
    def test_exec2_cbind_like_R(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()


        SEEDPERFILE = random.randint(0, sys.maxint)
        rowCount = 30000
        colCount = 150
        timeoutSecs = 60
        hex_key = "df"
        csvPathname = SYNDATASETS_DIR + "/" + "df.csv"
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

        colCount = 1
        hex_key = "indx"
        csvPathname = SYNDATASETS_DIR + "/" + "indx.csv"
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']

        for trial in range(10):
            for execExpr in exprList:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'

        h2o.check_sandbox_for_errors()
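exprList isn't shown; given the test name, the entries are presumably R-like exec2 expressions exercising cbind on the two parsed keys, along these lines (illustrative):

# Hypothetical exec2 expressions; df and indx are the keys parsed above.
exprList = [
    'h = cbind(df, indx)',
    'h = cbind(df[,1], indx)',
    'h = cbind(indx, indx)',
]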
Example #10
    def test_exec2_covtype_cols(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=30)
        print "\nParse key is:", parseResult['destination_key']

        ### h2b.browseTheCloud()
        start = time.time()
        # passes with suffix, fails without?
        # suffix = ""
        suffix = ".hex"
        for k in range(54):
            # try the funky c(6) thing like  R, instead of just 6
            execExpr = "Result" + str(k) + suffix + " = c.hex[,c(" + str(k+1) + ")]"
            print "execExpr:", execExpr
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(k) + suffix, 
                timeoutSecs=4)
            for node in h2o.nodes:
                storeView = h2o_cmd.runStoreView(node=node, noPrint=True)
                numKeys = len(storeView['keys'])
                # expect k + 2 keys on each node: the parsed c.hex plus Result0..Resultk
                self.assertEqual(k + 2, numKeys, "# of keys: %s on %s doesn't match expected: %s" % \
                    (numKeys, node, k + 2))
                    # (numKeys, node, k+2, h2o.dump_json(storeView)))

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
    def test_parse_fs_schmoo_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        # rowData = "1,0,65,1,2,1,1.4,0,6"
        rowData = "1,0,65,1,2,1,1,0,6"

        totalRows = 99860
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        print "Updating the key and hex_key names for each trial"
        for trial in range (200):
            append_syn_dataset(csvPathname, rowData)
            totalRows += 1

            start = time.time()
            key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            h2o_cmd.runInspect(key=hex_key)
            # only used this for debug to look at parse (red last row) on failure
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #12
    def test_exec2_sum(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if getpass.getuser()=='jenkins':
            csvPathname = 'standard/billion_rows.csv.gz'
        else:
            # alternative datasets; only the last assignment takes effect
            csvPathname = '1B/reals_100000x1000_15f.data'
            csvPathname = '1B/reals_1B_15f.data'
            csvPathname = '1B/reals_1000000x1000_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        for execExpr in exprList:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            print 'exec took', time.time() - start, 'seconds'
            print "result:", result

        h2o.check_sandbox_for_errors()
Example #13
    def test_exec2_operators(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        csvPathname = 'standard/covtype.data'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
            print h2o.dump_json(resultExec)
            print 'exec end took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key='a.hex')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            h2o.check_sandbox_for_errors()
Example #14
    def test_sort_of_prostate_with_row_schmoo(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        rowData = rand_rowData()
        write_syn_dataset(csvPathname, 1, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        for trial in range (100):

            rowData = rand_rowData()
            num = random.randint(1, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2, 
                timeoutSecs=70, pollTimeoutSecs=60)
            print "trial #", trial, "with num rows:", num, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=key2)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
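rand_rowData is a module-level helper. A sketch that matches the 9-column prostate-style header used here, assuming plausible per-column ranges:

import random

# Sketch of rand_rowData for ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON.
def rand_rowData():
    rowData = [
        random.randint(0, 1000000),        # ID
        random.randint(0, 1),              # CAPSULE
        random.randint(40, 80),            # AGE
        random.randint(0, 2),              # RACE
        random.randint(1, 4),              # DPROS
        random.randint(1, 2),              # DCAPS
        round(random.uniform(0, 100), 1),  # PSA
        round(random.uniform(0, 50), 1),   # VOL
        random.randint(0, 9),              # GLEASON
    ]
    return ",".join(map(str, rowData))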
Example #15
 def test_B_benign(self):
     print "\nStarting benign.csv"
     csvFilename = "benign.csv"
     csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
     parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
     # columns start at 0
     y = "3"
     # cols 0-13. 3 is output
     # no member id in this one
     for maxx in range(4,14):
         x = range(maxx)
         x.remove(3) # 3 is output
         x = ",".join(map(str,x))
         print "\nx:", x
         print "y:", y
         
         # solver can be ADMM
         kwargs = {'x': x, 'y':  y,\
              'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
         # fails with n_folds
         print "Not doing n_folds with benign. Fails with 'unable to solve?'"
         glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
         # no longer look at STR?
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         sys.stdout.write('.')
         sys.stdout.flush() 
Example #16
    def test_C_prostate(self):
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        for maxx in range(2,9):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            # solver can be ADMM. standardize normalizes the data.
            kwargs = {'x': x, 'y':  y, 'n_folds': 5,\
                'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            h2o.check_sandbox_for_errors()
            sys.stdout.write('.')
            sys.stdout.flush() 
Example #17
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(100):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind $v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ $v #1))'
                # execExpr1 = '(+ $v #1)'
                # add to itself?
                execExpr1 = '(+ $v $v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ $v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon $v2))'
                execExpr2 = '(= !v2 (+ $v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)


            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if lhs not in storeview['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeview['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #18
    def test_exec2_operators4(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        bigExecExpr = ""
        expCnt = 0

        for t in range(200):
            execExpr = random.choice(exprList)
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
            expCnt += 1
            # limit to 2 expressions. 
            # Also: functions must be solitary
            # Also: ifelse() must be solitary
            # Also: ternary operators must be solitary
            if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr:
                bigExecExpr = ""
                expCnt = 0
                

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Example #19
 def setUpClass(cls):
     start = time.time()
     h2o_hosts.build_cloud_with_hosts(node_count, base_port=base_port, 
         use_flatfile=True, java_heap_GB=1)
     print "Cloud of", len(h2o.nodes), "built in", time.time()-start, "seconds"
     h2o.verify_cloud_size()
     h2o.check_sandbox_for_errors()
    def test_sort_of_prostate_with_row_schmoo(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        rowData = rand_rowData()
        totalRows = 1000000
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        # used to fail around 50 iterations..python memory problem
        for trial in range (40):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=150, pollTimeoutSecs=150)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
    def test_NOPASS_GLM2_weight_nan_fail(self):
        h2o.beta_features = True
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
        kwargs = {
            'destination_key': 'GLM_model_python_0_default_0', 
            'family': 'tweedie', 
            'tweedie_variance_power': 1.9999999, 
            'max_iter': 10, 
            'alpha': 0, 
            'lambda': 0,
            'response': 54, 
        }

        for trial in range(3):
            # params is mutable. This is default.
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs)
            h2o.check_sandbox_for_errors()
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_rapids_funs_basic2(self):
        if 1==1:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'
        else:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(5):
            for execExpr in funsList:
                funs = '[%s]' % execExpr
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, 
                    timeoutSecs=4)
                execExpr2 = '(= !junk (apply %r1 #2 %anon))' 
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, 
                    timeoutSecs=15)
                # rows might be zero!
                if execResult['num_rows'] or execResult['num_cols']:
                    keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_GLM_params_rand2_4082088627997819015(self):
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype')
        paramDict = define_params()
        for trial in range(40):
            # params is mutable. This is default.
            params = {
                'y': 54, 
                'n_folds' : 3, 
                'family' : 'binomial', 
                'max_iter' : 5, 
                'case': 1, 
                'alpha': 0, 
                'lambda': 0
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10)
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time() - start
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            
            print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Trial #", trial, "completed\n"
Example #24
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None, 
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, **kwargs):

    ## if h2o.beta_features:
    ##     print "HACK: temporarily disabling Summary always in v2 import_parse"
    ##     doSummary = False

    if not node: node = h2o.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, doSummary, src_key, **kwargs)

    h2o.verboseprint("importPattern:", importPattern)
    h2o.verboseprint("importResult", h2o.dump_json(importResult))

    parseResult = parse_only(node, importPattern, hex_key,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, **kwargs)
    h2o.verboseprint("parseResult:", h2o.dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    if doSummary:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        h2o.check_sandbox_for_errors()
        node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        # for now, don't worry about error isolating summary 
    else:
        # isolate a parse from the next thing
        h2o.check_sandbox_for_errors()

    return parseResult
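A typical call, mirroring how the tests above use this function ('put' uploads a local file; 'local'/'hdfs'/'s3' import from a folder instead):

# Example usage of import_parse, matching the calls in the tests above.
parseResult = import_parse(
    bucket='smalldata',
    path='iris/iris_wheader.csv',
    schema='put',
    hex_key='iris.hex',
    timeoutSecs=60,
    doSummary=False)    # skip the SummaryPage coverage step
print parseResult['destination_key']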
    def test_rf_float_rand2_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10000
        write_syn_dataset(csvPathname, totalRows, headerData)

        for trial in range (5):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            kwargs = {'ntrees': 5, 'max_depth': 5}
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #26
 def test_prostate_then_prostate_long_parse(self):
     print "\nput and parse of same file, but both key and key2 are the h2o defaults..always different"
     for trial in range(10):
         start = time.time()
         key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate_long.csv.gz"))
         print "trial #", trial, "parse end on ", "prostate_long.csv.gz", "took", time.time() - start, "seconds"
         h2o.check_sandbox_for_errors()
Example #27
    def test_C_prostate(self):
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        csvFilename = "prostate.csv"
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')

        for maxx in range(2,9):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y

            # solver can be ADMM. standardize normalizes the data.
            kwargs = {'x': x, 'y':  y, 'n_folds': 5,\
                'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize':1}
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            h2o.check_sandbox_for_errors()
            sys.stdout.write('.')
            sys.stdout.flush() 
Example #28
 def test_B_benign(self):
     print "\nStarting benign.csv"
     csvFilename = "benign.csv"
     csvPathname = 'logreg/' + csvFilename
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
     # columns start at 0
     y = "3"
     # cols 0-13. 3 is output
     # no member id in this one
     for maxx in range(4,14):
         x = range(maxx)
         x.remove(3) # 3 is output
         x = ",".join(map(str,x))
         print "\nx:", x
         print "y:", y
         
         # solver can be ADMM
         kwargs = {'x': x, 'y':  y,\
              'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1}
         # fails with n_folds
         print "Not doing n_folds with benign. Fails with 'unable to solve?'"
         glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs)
         # no longer look at STR?
         h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
         h2o.check_sandbox_for_errors()
         sys.stdout.write('.')
         sys.stdout.flush() 
    def test_rapids_vec_fail1(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6),int(100e6),int(10e6)):
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)}))' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)}))' % (length - 1)
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v)))'
            execExpr = '(= !v (+ %v %v))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30)
            elapsed2 = time.time() - start

            if execResult['num_rows']:
                keys.append(execExpr)
            
            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)


        if 1==1:
            xLabel = 'vector length'
            eLabel = 'elapsed (create v)'
            fLabel = 'elapsed (v = v + v)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #31
    def test_parse_time(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = None
        colCount = COLS
        # rowCount = 1000
        rowCount = ROWS
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(20):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            # src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key)
            print "A trial #", trial
            # optional. only needed to extract parse_key?
            pA = h2o_cmd.ParseObj(parseResultA,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(key=pA.parse_key,
                                      csvPathname=csvDownloadPathname)

            # do a little testing of saving the key as a csv
            # remove the original parsed key. source was already removed by h2o
            if 1 == 0:
                h2o.nodes[0].remove_key(pA.parse_key)

            # interesting. what happens when we do csv download with time data?
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            hex_key=hex_key)
            print "B trial #", trial
            pB = h2o_cmd.ParseObj(parseResultB,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pB.numRows
            print pB.numCols
            print pB.parse_key
            iB = h2o_cmd.InspectObj(pB.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            # these checks are redundant now
            self.assertEqual(
                iA.missingList, iB.missingList,
                "missingValuesList mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                iA.numCols, iB.numCols,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # so I guess that's okay. So allow for an extra row here.
            self.assertEqual(iA.numRows, iB.numRows,
                "pA.numRows: %s pB.numRows: %s mismatch after re-parse of downloadCsv result" % \
                (iA.numRows, iB.numRows) )
            print "H2O writes the internal format (number) out for time."

            # ==> syn_time.csv <==
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

            # ==> csvDownload.csv <==
            # "0","1","2","3","4","5"
            # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12

            h2o.check_sandbox_for_errors()
Example #32
 def tearDown(self):
     h2o.check_sandbox_for_errors()
Example #33
    def test_csv_download_libsvm(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5000, 10000, 'cK', 120),
            (10000, 10000, 'cL', 120),
            (50000, 10000, 'cM', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        trial = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            trial += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            start = time.time()
            # Summary is kind of slow. should I do it separately
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key,
                                            timeoutSecs=timeoutSecs,
                                            doSummary=False)
            print "\nA Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
            numColsA = inspect['numCols']
            numRowsA = inspect['numRows']
            byteSizeA = inspect['byteSize']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            print "\nStarting csv download to", csvDownloadPathname, "rowCount:", rowCount, "colCount:", colCount
            start = time.time()
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)
            print "csv_download end.", 'took', time.time(
            ) - start, 'seconds. Originally from:', csvFilename

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            hex_key=hex_key,
                                            timeoutSecs=timeoutSecs)
            print "\nB Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            numColsB = inspect['numCols']
            numRowsB = inspect['numRows']
            byteSizeB = inspect['byteSize']

            self.assertEqual(
                missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result %d %d"
                % (numColsA, numColsB))
            self.assertEqual(
                numRowsA, numRowsB,
                "numRows mismatches after re-parse of downloadCsv result %d %d"
                % (numRowsA, numRowsB))

            if DO_BYTESIZE_COMPARE:
                self.assertEqual(
                    byteSizeA, byteSizeB,
                    "byteSize mismatches after re-parse of downloadCsv result %d %d"
                    % (byteSizeA, byteSizeB))

            h2o.check_sandbox_for_errors()
Example #34
    def test_parse_csv_download(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        rowData = rand_rowData()
        totalRows = 1000000
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        # failed around 50 trials..python memory problem
        for trial in range(20):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "\nA trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsA = inspect['num_cols']
            num_rowsA = inspect['num_rows']
            row_sizeA = inspect['row_size']
            value_size_bytesA = inspect['value_size_bytes']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "B trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            self.assertEqual(
                missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                num_colsA, num_colsB,
                "num_cols mismatches after re-parse of downloadCsv result")
            self.assertEqual(
                num_rowsA, num_rowsB,
                "num_rows mismatches after re-parse of downloadCsv result")
            self.assertEqual(
                row_sizeA, row_sizeB,
                "row_size mismatches after re-parse of downloadCsv result")
            self.assertEqual(
                value_size_bytesA, value_size_bytesB,
                "value_size_bytes mismatches after re-parse of downloadCsv result"
            )

            h2o.check_sandbox_for_errors()
    def test_rapids_basic_with_funs_inc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(2):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0;#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon { v } %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v1 (anon %v1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey=None, timeoutSecs=5)

            print "result:", result

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0: 
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

                    storeview = h2o_cmd.runStoreView()
                    print "\nstoreview:", dump_json(storeview)
                    if lhs not in storeview['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeview['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #36
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    if (i+1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(path=csvPathname,
                            hex_key=csvFilename + ".hex", 
                            timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
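
# The fragment above repeats the same elapsed-vs-timeout report after every
# parse. A small helper (hypothetical, not part of h2o_cmd) states the
# pattern once: wall-clock seconds, and what percent of the timeout was used.
import time

def report_parse(trial, destination_key, start, timeoutSecs):
    elapsed = time.time() - start
    print "Parse#", trial, destination_key, "took", elapsed, "seconds", \
        "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
    return elapsed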
Example #37
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    ignore_x = []
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                        x.remove(i)
                        ignore_x.append(i)
                    x.remove(378)

                    # add one: fvec column names are 1-based ("C1" is the first column)
                    x = ",".join(map(lambda c: "C" + str(c+1), x))
                    ignore_x = ",".join(map(lambda c: "C" + str(c+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    # execExpr="A.hex=%s" % parseResult['destination_key']
                    # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
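
# The MB/sec figure logged above is just uncompressed bytes over wall-clock
# seconds. A worked sketch with hypothetical numbers (100 files at the
# average gz'ed michal size used elsewhere in this file):
totalBytes = 100 * 116561140   # bytes, hypothetical
elapsed = 95.0                 # seconds, hypothetical
fileMBS = (totalBytes / 1e6) / elapsed
print "%6.2f MB/sec" % fileMBS # -> 122.70 MB/sec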
Example #38
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                
                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
                        start = time.time()
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        print r, 'exec took', time.time() - start, 'seconds'
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()
                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult != expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
Example #39
    def test_parse_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/libsvm"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1, 2, True, True),
            ("mnist_train.svm", "cM", 30, 0, 9, False, False),
            # multi-label target like 1,2,5 ..not sure what that means
            # ("tmc2007_train.svm",  "cJ", 30, 0, 21.0, False, False),
            # illegal non-ascending cols
            # ("syn_6_1000_10.svm",  "cK", 30, -36, 36, True, False),
            # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
            # fails csvDownload
            ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
            ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
            ("news20.svm", "cH", 30, 1, 20, False, False),
            ("connect4.svm", "cB", 30, -1, 1, False, False),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
            ("gisette_scale.svm", "cF", 30, -1, 1, False, False),
            ("mushrooms.svm", "cG", 30, 1, 2, False, False),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, expectedCol0Min, expectedCol0Max,
             enableDownloadReparse, enableSizeChecks) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvPathname, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspectFirst = h2o_cmd.runInspect(None,
                                              parseKey['destination_key'],
                                              timeoutSecs=360)
            print "Inspect:", parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"
            h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
            # look at the min/max for the target col (0) and compare to expected for the dataset

            imin = inspectFirst['cols'][0]['min']
            imax = inspectFirst['cols'][0]['max']

            if expectedCol0Min:
                self.assertEqual(
                    imin,
                    expectedCol0Min,
                    msg='col %s min %s is not equal to expected min %s' %
                    (0, imin, expectedCol0Min))
            if expectedCol0Max:
                self.assertEqual(
                    imax,
                    expectedCol0Max,
                    msg='col %s max %s is not equal to expected max %s' %
                    (0, imax, expectedCol0Max))

            print "\nmin/max for col0:", imin, imax

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            if DO_SUMMARY:
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0,
                    key=parseKey['destination_key'],
                    timeoutSecs=300,
                    noPrint=True)
                summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
                missingValuesListA = h2o_cmd.infoFromInspect(
                    inspectFirst, csvPathname)
                num_colsA = inspectFirst['num_cols']
                num_rowsA = inspectFirst['num_rows']
                row_sizeA = inspectFirst['row_size']
                value_size_bytesA = inspectFirst['value_size_bytes']

                # do a little testing of saving the key as a csv
                csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
                print "Trying csvDownload of", csvDownloadPathname
                h2o.nodes[0].csv_download(key=key2,
                                          csvPathname=csvDownloadPathname)

                # remove the original parsed key. source was already removed by h2o
                # don't have to now. we use a new name for key2B
                # h2o.nodes[0].remove_key(key2)
                start = time.time()
                key2B = key2 + "_B"
                parseKeyB = h2o_cmd.parseFile(csvPathname=csvDownloadPathname,
                                              key2=key2B)
                print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                    csvFilename, 'took', time.time() - start, 'seconds'
                inspect = h2o_cmd.runInspect(key=key2B)

                missingValuesListB = h2o_cmd.infoFromInspect(
                    inspect, csvPathname)
                num_colsB = inspect['num_cols']
                num_rowsB = inspect['num_rows']
                row_sizeB = inspect['row_size']
                value_size_bytesB = inspect['value_size_bytes']

                df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)

                for i, d in enumerate(df.difference):
                    # ignore mismatches in these
                    #  "variance"
                    #  "response.time"
                    #  "key"
                    if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d:
                        pass
                    else:
                        raise Exception(
                            "testing %s, found unexpected mismatch in df.difference[%d]: %s"
                            % (csvPathname, i, d))

                if DO_SIZE_CHECKS and enableSizeChecks:
                    # if we're allowed to do size checks, compare the full json response!
                    print "Comparing original inspect to the inspect after parsing the downloaded csv"
                    # vice_versa=True
                    self.assertGreater(len(df.difference), 29,
                        msg="Want >= 30, not %d differences between the two inspect json responses. %s" % \
                            (len(df.difference), h2o.dump_json(df.difference)))

                    # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes
                    # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen
                    # make the check conditional based on the dataset
                    self.assertEqual(
                        row_sizeA, row_sizeB,
                        "row_size mismatches after re-parse of downloadCsv result %d %d"
                        % (row_sizeA, row_sizeB))
                    self.assertEqual(
                        value_size_bytesA, value_size_bytesB,
                        "value_size_bytes mismatches after re-parse of downloadCsv result %d %d"
                        % (value_size_bytesA, value_size_bytesB))

                print "missingValuesListA:", missingValuesListA
                print "missingValuesListB:", missingValuesListB
                self.assertEqual(
                    missingValuesListA, missingValuesListB,
                    "missingValuesList mismatches after re-parse of downloadCsv result"
                )
                self.assertEqual(
                    num_colsA, num_colsB,
                    "num_cols mismatches after re-parse of downloadCsv result %d %d"
                    % (num_colsA, num_colsB))
                self.assertEqual(
                    num_rowsA, num_rowsB,
                    "num_rows mismatches after re-parse of downloadCsv result %d %d"
                    % (num_rowsA, num_rowsB))

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
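
# A minimal sketch of the diff-filtering idea above (JsonDiff itself lives in
# h2o_util; this only shows the intent): keep just the differences that are
# not in fields expected to change between a parse and a re-parse.
VOLATILE = ('variance', 'response.time', 'key', 'value_size_bytes', 'row_size')

def real_mismatches(differences):
    # differences: list of strings naming the json fields that differ
    return [d for d in differences if not any(v in d for v in VOLATILE)]

assert real_mismatches(['response.time', 'num_cols']) == ['num_cols']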
Example #40
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000
        avgMichalSize = 116561140
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize = 183
        if 1 == 1:
            # importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            # importFolderPath = 'more1_1200_link'
            importFolderPath = 'manyfiles-nflx-gz'
            # the full path is referenced later by the overlapped (noPoll) parses
            importFolderPathFull = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            # this pattern from the browser correctly does 100 files, 1M rows
            # source_key=*/home/0xdiag/datasets/manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz
            csvFilenameAll = [
                ("file_1.dat.gz", "file_1_A.dat.gz", 1 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz",
                 100 * avgMichalSize, 3600),

                # ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                # ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz",
                 140 * avgMichalSize, 3600),

                # ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                # ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),

                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 180
        retryDelaySecs = 10

        localhost = h2o.decide_if_localhost()
        if localhost:
            tryHeap = 4
            h2o.build_cloud(2,
                            java_heap_GB=tryHeap,
                            base_port=base_port,
                            enable_benchmark_log=True)
        else:
            tryHeap = 28
            h2o_hosts.build_cloud_with_hosts(1,
                                             java_heap_GB=tryHeap,
                                             base_port=base_port,
                                             enable_benchmark_log=True)

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2
            h2o.beta_features = True

            for trial in range(trialMax):
                # (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")

                if DO_IMPORT_CHECK:
                    # don't reuse 'i' here; it's the file index from the enumerate above
                    for _ in range(2):
                        csvPathname = importFolderPath + "/" + csvFilepattern
                        (importResult, importPattern) = h2i.import_only(
                            bucket='home-0xdiag-datasets',
                            path=csvPathname,
                            schema='local',
                            timeoutSecs=timeoutSecs)

                        importFullList = importResult['files']
                        importFailList = importResult['fails']
                        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                            importFailList)
                        # creates csvFilename.hex from file in importFolder dir

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                csvPathname = importFolderPath + "/" + csvFilepattern
                start = time.time()
                parseResult = h2i.import_parse(
                    bucket='home-0xdiag-datasets',
                    path=csvPathname,
                    schema='local',
                    hex_key=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(
                            path=csvPathname,
                            hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(
                            None,
                            parseResult['destination_key'],
                            timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                    if (i + 2) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        parseResult = h2i.import_parse(
                            path=csvPathname,
                            hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(
                            None,
                            parseResult['destination_key'],
                            timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    pass
                    # We should be able to see the parse result?
                    # h2o_cmd.check_enums_from_inspect(parseResult)

                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                # h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRF(trees=1,depth=25,parseResult=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [
                            3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19,
                            20, 424, 425, 426, 540, 541, 378
                    ]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x,
                        'y': 378,
                        'case': 15,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult,
                                         timeoutSecs=timeoutSecs,
                                         **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename,
                                                   importResult=importResult)
                h2o.nodes[0].remove_all_keys()

                ### time.sleep(3600)

                ### h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
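
# Side note on the glob patterns above: "*[3-4][0-4][0-9].dat.gz" matches
# files 300-349 and 400-449, i.e. 2 * 5 * 10 = 100 files, which is how the
# totalBytes multipliers were chosen. A quick stdlib check:
import fnmatch

names = ['file_%d.dat.gz' % n for n in range(1, 1000)]
print len(fnmatch.filter(names, '*[3-4][0-4][0-9].dat.gz'))  # -> 100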
Example #41
    def test_parse_time_rand_fvec_NOPASS(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        colCount = 6
        rowCount = 10
        headerData = rand_header(colCount)
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(1):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsA = inspect['numRows']
            numColsA = inspect['numCols']

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
                numCols=numColsA, numRows=numRowsA, noPrint=True)

            print summaryResult
            h2o_cmd.infoFromSummary(summaryResult)
            (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)


            if constantValuesDictA or enumSizeDictA:
                raise Exception("Should be empty?  constantValuesDictA %s enumSizeDictA %s" % (constantValuesDictA, enumSizeDictA))

            print "missingValuesListA", missingValuesListA

            # self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
            self.assertEqual(numColsA, colCount)
            self.assertEqual(numRowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsB = inspect['numRows']
            numColsB = inspect['numCols']
            print "missingValuesListB", missingValuesListB
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
                numCols=numColsB, numRows=numRowsB, noPrint=True)
            (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
            if constantValuesDictB or enumSizeDictB:
                raise Exception("Should be empty?  constantValuesDictB %s enumSizeDictB %s" % (constantValuesDictB, enumSizeDictB))

            self.assertEqual(missingValuesDictA, missingValuesDictB,
                "missingValuesDict mismatches after re-parse of downloadCsv result")
            self.assertEqual(numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # but in this dataset we have a header too, so the row counts should be equal
            # if not, maybe the parse of our dataset didn't detect a row
            self.assertEqual(numRowsA, numRowsB,
                "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB) )

            # FIX! should do some comparison of values? 
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
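
# The FIX! comments above ask for a value-level comparison of parse A and
# parse B. One client-side sketch (pure stdlib; assumes comma-separated data
# with one header row, as this test writes and downloads):
import csv

def column_checksums(pathname):
    # order-insensitive per-column checksum of the raw string values
    sums = {}
    f = open(pathname)
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        for name, val in zip(header, row):
            sums[name] = sums.get(name, 0) + hash(val)
    f.close()
    return sums

# column_checksums(csvPathname) vs column_checksums(csvDownloadPathname)
# may legitimately differ for the time columns, since h2o can rewrite time
# values in its own format -- which is what this NOPASS test is probing.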
Example #42
    def test_exec2_xorsum2(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(3):
            ullResultList = []
            NUM_FORMAT_CASES = h2o_util.fp_format()
            for (rowCount, colCount, hex_key, expectedMin, expectedMax,
                 expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(
                    None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname

                sel = random.randint(0, NUM_FORMAT_CASES - 1)
                (expectedUllSum, expectedFpSum) = write_syn_dataset(
                    csvPathname, rowCount, colCount, expectedMin, expectedMax,
                    SEEDPERFILE, sel)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                    expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                    expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=3000,
                                               retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for repeat in range(3):
                        start = time.time()
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0],
                                                               execExpr,
                                                               resultKey=None,
                                                               timeoutSecs=300)
                        print 'exec took', time.time() - start, 'seconds'
                        print "execResult:", h2o.dump_json(execResult)
                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                            ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                            expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                            expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way. needed when integers are parsed

                        # okay for a couple of lsbs to be wrong, due to conversion from string
                        # ullResult (0.16x): 0x02c1a21f923cee96   2.15698793923e-295
                        # expectedUllSum (0.16x): 0x02c1a21f923cee97   2.15698793923e-295
                        # expectedFpSum (0.16x): 0x42f054af32b3c408   2.87294442126e+14

                        # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                        # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                        if ullResult != expectedUllSum and (
                                abs(ullResult - expectedUllSum) >
                                ALLOWED_DELTA):
                            emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                                ullResult, expectedUllSum)
                            if STOP_ON_ERROR:
                                raise Exception(emsg)
                            else:
                                print emsg

                        # print "%30s" % "hex(bitResult):", hex(ullResult)

                    h2o.check_sandbox_for_errors()

                    print "first result was from a sum. others are xorsum"
                    print "ullResultList:"
                    for ullResult, fpResult in ullResultList:
                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                            ullResult, fpResult)

                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                        expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                        expectedFpSumAsLongLong, expectedFpSum)
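
# A sketch of the lsb-tolerance check used above: the xorsums are compared as
# 64-bit ints, and a difference within ALLOWED_DELTA low bits is accepted,
# since string->double conversion can be off by an lsb. The value of
# ALLOWED_DELTA below is hypothetical; the real one lives at module scope.
ALLOWED_DELTA = 10

def close_enough(ullResult, expectedUllSum, allowed=ALLOWED_DELTA):
    return ullResult == expectedUllSum or abs(ullResult - expectedUllSum) <= allowed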