def test_GLMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')

        y = "1"
        x = range(9)
        x.remove(0) # 0. member ID. not used.
        x.remove(1) # 1 is output
        x = ','.join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        # FIX! thresholds is used in GLMGrid. threshold is used in GLM
        # comma separated means use discrete values
        # colon separated is min/max/step
        # FIX! have to update other GLMGrid tests
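        # e.g. lambda '1e-8:1e3:100' sweeps from 1e-8 to 1e3 in 100 steps,
        # while alpha '0,0.5,1' tries exactly those three values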
        kwargs = {
            'x': x, 'y':  y, 'n_folds': 2, 
            'beta_eps': 1e-4,
            'lambda': '1e-8:1e3:100', 
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
            }

        gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs)
        colNames = ['D','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
        # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs)
        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
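These snippets are methods from h2o-classic's unittest-based suite, so they assume a live cloud plus the helper modules (h2o, h2o_cmd, h2i, h2o_glm) on the path. A minimal harness for running one of them standalone might look like this sketch; the node count, class name, and the h2o_import alias are assumptions, not taken from the source:

import unittest
import h2o, h2o_cmd, h2o_glm
import h2o_import as h2i

class Basic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # assumed: build a small local cloud before the tests run
        h2o.build_cloud(node_count=1)

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    # paste one of the test_* methods from these examples here

if __name__ == '__main__':
    h2o.unit_main()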
Example #2
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting parse of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename, timeoutSecs=10)
    y = "10"
    x = ""
    # NOTE: hastie has two values, -1 and 1. To make H2O work when the response
    # is two-valued and not 0,1, specify case and case_mode (see kwargs below).
    kwargs = {
        "x": x,
        "y": y,
        "case": "1",
        "destination_key": "gg",
        # better classifier it flipped? (better AUC?)
        "max_iter": 10,
        "case": -1,
        "case_mode": "=",
        "num_cross_validation_folds": 0,
        "lambda": "1e-8,1e-4,1e-3",
        "alpha": "0,0.25,0.8",
        "thresholds": "0.2:0.8:0.1",
    }

    start = time.time()
    print "\nStarting GLMGrid of", csvFilename
    glmGridResult = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "GLMGrid in", (time.time() - start), "secs (python)"

    h2o_glm.simpleCheckGLMGrid(self, glmGridResult, **kwargs)
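glm_doit is a helper extracted from a test class, so it expects to be called from a test method. A hypothetical caller (the hastie filename and path are illustrative, not from this source) would look like:

def test_GLM_hastie(self):
    # hypothetical dataset; glm_doit does the parse and the grid run itself
    csvPathname = h2o.find_file('smalldata/logreg/hastie_10_2.data')
    glm_doit(self, 'hastie_10_2.data', csvPathname, timeoutSecs=60)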
Example #3
    def test_GLM2Grid_basic_benign(self):
        h2o.beta_features = True
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        print "y:", y
        
        kwargs = {
            'ignored_cols': '0,1', 
            'response':  y, 
            'n_folds': 0, 
            'lambda': '1e-8:1e-2:100', 
            'alpha': '0,0.5,1',
            }
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        # the gridded params make it grid..just call GLM2
        gg = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=120, **kwargs)
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK',
                     'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #4
    def test_GLMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = "logreg/" + csvFilename
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")

        y = "1"
        x = range(9)
        x.remove(0)  # 0. member ID. not used.
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        # FIX! thresholds is used in GLMGrid. threshold is used in GLM
        # comma separated means use discrete values
        # colon separated is min/max/step
        # FIX! have to update other GLMGrid tests
        kwargs = {
            "x": x,
            "y": y,
            "n_folds": 2,
            "beta_eps": 1e-4,
            "lambda": "1e-8:1e3:100",
            "alpha": "0,0.5,1",
            "thresholds": "0:1:0.01",
        }

        gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs)
        colNames = ["D", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs)
        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #5
    def test_GLM2grid_covtype_many(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'response': y,
            'family': 'gaussian',
            'n_folds': 2,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "glmResult:", h2o.dump_json(glmResult)
            # assuming it doesn't complete right away, this is the first response
            # it differs for the last response
            job_key = glmResult['job_key']
            grid_key = glmResult['destination_key']
            jobs.append( (job_key, grid_key) )
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
                src_key=src_key, hex_key=hex_key, 
                timeoutSecs=10, noPoll=True, doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        # 2/GLMGridView.html?grid_key=asd
        # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
        # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
        for job_key, grid_key in jobs:
            gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
            h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs
Example #6
    def test_GLMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        x = range(14)
        x.remove(0) # 0. skipping causes coefficient of 0 when used alone
        x.remove(3) # 3 is output
        x = ','.join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y
        
        kwargs = {
            'x': x, 'y':  y, 'n_folds': 0, 
            'lambda': '1e-8:1e-2:100', 
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
            }
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"

        gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs)
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK',
                     'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #7
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting parse of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket,
                                   path=csvPathname,
                                   schema='put',
                                   hex_key=csvFilename + ".hex",
                                   timeoutSecs=10)
    y = "10"
    # NOTE: hastie has two values, -1 and 1; H2O needs the two-valued response
    # handled specially when it isn't 0,1.
    kwargs = {
        'response': y,
        'max_iter': 10,
        'n_folds': 2,
        'lambda': '1e-8,1e-4,1e-3',
        'alpha': '0,0.25,0.8',
    }

    start = time.time()
    print "\nStarting GLMGrid of", csvFilename
    glmGridResult = h2o_cmd.runGLM(parseResult=parseResult,
                                   timeoutSecs=timeoutSecs,
                                   **kwargs)
    print "GLMGrid in", (time.time() - start), "secs (python)"

    # still get zero coeffs..best model is AUC = 0.5 with intercept only.
    h2o_glm.simpleCheckGLMGrid(self,
                               glmGridResult,
                               allowZeroCoeff=True,
                               **kwargs)
Example #8
    def test_GLM2Grid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        print "y:", y

        kwargs = {
            'ignored_cols': '0,1',
            'response': y,
            'n_folds': 0,
            'lambda': '1e-8:1e-2:100',
            'alpha': '0,0.5,1',
        }
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        # the gridded params make it grid..just call GLM2
        gg = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=120, **kwargs)
        # check the first in the models list. It should be the best
        colNames = [
            'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN',
            'NLV', 'LIV', 'WT', 'AGLP', 'MST'
        ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #9
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting parse of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    y = "10"
    x = ""
    # NOTE: hastie has two values, -1 and 1. To make H2O work when the response
    # is two-valued and not 0,1, specify case and case_mode (see kwargs below).
    kwargs = {
        'x': x, 'y':  y, 'destination_key': 'gg',
        # better classifier if flipped? (better AUC?)
        'max_iter': 10,
        'case': -1, 'case_mode': '=',
        'num_cross_validation_folds': 0,
        'lambda': '1e-8,1e-4,1e-3',
        'alpha': '0,0.25,0.8',
        # hardwire threshold to 0.5 because the dataset is so sensitive right around the threshold
        # otherwise, GLMGrid will pick a model with zero coefficients, if it has the best AUC
        # to avoid my checker complaining about all zero coefficients, force the threshold to 0.5
        'thresholds': '0.5',
        # 'thresholds': '0.2:0.8:0.1'
        }

    start = time.time() 
    print "\nStarting GLMGrid of", csvFilename
    glmGridResult = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "GLMGrid in",  (time.time() - start), "secs (python)"

    # still get zero coeffs..best model is AUC = 0.5 with intercept only.
    h2o_glm.simpleCheckGLMGrid(self, glmGridResult, allowZeroCoeff=True, **kwargs)
Example #10
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting parse of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=10)
    y = "10"
    x = ""
    # NOTE: hastie has two values, -1 and 1. To make H2O work when the response
    # is two-valued and not 0,1, specify case and case_mode (see kwargs below).
    kwargs = {
        'x': x, 'y':  y,
        # better classifier if flipped? (better AUC?)
        'max_iter': 10,
        'case': -1, 'case_mode': '=',
        'n_folds': 2,
        'lambda': '1e-8,1e-4,1e-3',
        'alpha': '0,0.25,0.8',
        # hardwire threshold to 0.5 because the dataset is so sensitive right around the threshold
        # otherwise, GLMGrid will pick a model with zero coefficients, if it has the best AUC
        # to avoid my checker complaining about all zero coefficients, force the threshold to 0.5
        'thresholds': '0.5',
        # 'thresholds': '0.2:0.8:0.1'
        }

    start = time.time() 
    print "\nStarting GLMGrid of", csvFilename
    glmGridResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLMGrid in",  (time.time() - start), "secs (python)"

    # still get zero coeffs..best model is AUC = 0.5 with intercept only.
    h2o_glm.simpleCheckGLMGrid(self, glmGridResult, allowZeroCoeff=True, **kwargs)
Example #11
    def test_GLMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        x = range(14)
        # 0 and 1 are id-like values
        x.remove(0)
        x.remove(1)

        x.remove(3) # 3 is output
        x = ','.join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y
        
        kwargs = {
            'x': x, 'y':  y, 'n_folds': 0, 
            'lambda': '1e-8:1e-2:100', 
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
            }
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs)
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK',
                     'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #12
    def test_GLMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')

        y = "1"
        # 0. member ID. not used.
        # 1 is output

        print "y:", y

        # FIX! thresholds is used in GLMGrid. threshold is used in GLM
        # comma separated means use discrete values
        # colon separated is min/max/step
        # FIX! have to update other GLMGrid tests
        kwargs = {
            'ignored_cols': 0, 
            'response':  y, 
            'n_folds': 2, 
            'lambda': '1e-8:1e3:100', 
            'alpha': '0,0.5,1',
            }

        # the gridded params make it grid..just call GLM2
        gg = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=120, **kwargs)
        colNames = ['D','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
        # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs)
        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
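One thing these examples show when read side by side: the GLM2-era tests swap GLM1's column-index kwargs for named ones. A rough mapping, inferred from the examples here rather than from GLM2 docs:

# GLM1 kwargs          -> GLM2 kwargs
#   'y': "1"           -> 'response': "1"
#   'x': "2,3,...,8"   -> 'ignored_cols': "0"  (response apparently dropped automatically)
#   'beta_eps': 1e-4   -> 'beta_epsilon': 1e-4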
Example #13
    def test_GLM2grid_covtype_many(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=20)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'response': y,
            'family': 'gaussian',
            'n_folds': 2,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "glmResult:", h2o.dump_json(glmResult)
            # assuming it doesn't complete right away, this is the first response
            # it differs for the last response
            job_key = glmResult['job_key']
            grid_key = glmResult['destination_key']
            jobs.append( (job_key, grid_key) )
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
                src_key=src_key, hex_key=hex_key, 
                timeoutSecs=10, noPoll=True, doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        # 2/GLMGridView.html?grid_key=asd
        # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
        # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
        for job_key, grid_key in jobs:
            gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
            h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs
Example #14
    def test_GLM2grid_convergence_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50, 'cD', 300),
            (100, 100, 'cE', 300),
            (100, 200, 'cF', 300),
            (100, 300, 'cG', 300),
            (100, 400, 'cH', 300),
            (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           schema='put')
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'max_iter': 10,
                'n_folds': 2,
                'beta_epsilon': 1e-4,
                'lambda': '1e-8:1e-3:1e2',
                'alpha': '0,0.5,.75',
            }

            kwargs['response'] = y

            for i in range(2):
                start = time.time()
                # get rid of the Jstack polling
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
                print "glm grid result", h2o.dump_json(glm)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can
                # redo it in the browser for comparison
                h2o_glm.simpleCheckGLMGrid(self, glm, None, **kwargs)
Example #15
    def test_GLM_convergence_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50,  'cD', 300),
            (100, 100, 'cE', 300),
            (100, 200, 'cF', 300),
            (100, 300, 'cG', 300),
            (100, 400, 'cH', 300),
            (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                    'max_iter': 10, 
                    'weight': 1.0,
                    'link': 'familyDefault',
                    'n_folds': 2,
                    'beta_eps': 1e-4,
                    'lambda': '1e-8:1e-3:1e2',
                    'alpha': '0,0.5,.75',
                    'thresholds': '0,1,0.2'
                    }

            kwargs['y'] = y

            emsg = None
            for i in range(2):
                start = time.time()
                # get rid of the Jstack polling
                glm = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can
                # redo it in the browser for comparison
                warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)
                # remember the first "failed to converge" warning so it can be raised below
                if warnings:
                    for w in warnings:
                        if emsg is None and re.search("[Ff]ailed", w):
                            emsg = w

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
Example #16
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting parse of", csvFilename
    parseResult = h2i.import_parse(
        bucket=bucket, path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=20
    )
    y = "10"
    # NOTE: hastie has two values, -1 and 1; H2O needs the two-valued response
    # handled specially when it isn't 0,1.
    kwargs = {"response": y, "max_iter": 10, "n_folds": 2, "lambda": "1e-8,1e-4,1e-3", "alpha": "0,0.25,0.8"}

    start = time.time()
    print "\nStarting GLMGrid of", csvFilename
    glmGridResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLMGrid in", (time.time() - start), "secs (python)"

    # still get zero coeffs..best model is AUC = 0.5 with intercept only.
    h2o_glm.simpleCheckGLMGrid(self, glmGridResult, allowZeroCoeff=True, **kwargs)
Example #17
    def test_GLM2grid_convergence_1(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50, "cD", 300),
            (100, 100, "cE", 300),
            (100, 200, "cF", 300),
            (100, 300, "cG", 300),
            (100, 400, "cH", 300),
            (100, 500, "cI", 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%sx%s.csv" % (SEEDPERFILE, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema="put")
            print "Parse result['destination_key']:", parseResult["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                "max_iter": 10,
                "n_folds": 2,
                "beta_epsilon": 1e-4,
                "lambda": "1e-8:1e-3:1e2",
                "alpha": "0,0.5,.75",
            }

            kwargs["response"] = y

            for i in range(2):
                start = time.time()
                # get rid of the Jstack polling
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                print "glm grid result", h2o.dump_json(glm)
                print "glm #", i, "end on", csvPathname, "took", time.time() - start, "seconds"
                # we can pass the warning, without stopping in the test, so we can
                # redo it in the browser for comparison
                h2o_glm.simpleCheckGLMGrid(self, glm, None, **kwargs)
Example #18
    def test_GLMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        x = range(14)
        # 0 and 1 are id-like values
        x.remove(0)
        x.remove(1)

        x.remove(3)  # 3 is output
        x = ','.join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        kwargs = {
            'x': x,
            'y': y,
            'n_folds': 0,
            'lambda': '1e-8:1e-2:100',
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
        }
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        gg = h2o_cmd.runGLMGrid(parseResult=parseResult,
                                timeoutSecs=120,
                                **kwargs)
        # check the first in the models list. It should be the best
        colNames = [
            'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN',
            'NLV', 'LIV', 'WT', 'AGLP', 'MST'
        ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #19
    def test_GLMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')

        y = "1"
        x = range(9)
        x.remove(0)  # 0. member ID. not used.
        x.remove(1)  # 1 is output
        x = ','.join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        # FIX! thresholds is used in GLMGrid. threshold is used in GLM
        # comma separated means use discrete values
        # colon separated is min/max/step
        # FIX! have to update other GLMGrid tests
        kwargs = {
            'x': x,
            'y': y,
            'n_folds': 2,
            'beta_eps': 1e-4,
            'lambda': '1e-8:1e3:100',
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
        }

        gg = h2o_cmd.runGLMGrid(parseResult=parseResult,
                                timeoutSecs=120,
                                **kwargs)
        colNames = [
            'D', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL',
            'GLEASON'
        ]
        # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs)
        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #20
    def test_GLMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = "logreg/" + csvFilename
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        x = range(14)
        x.remove(0)  # 0. skipping causes coefficient of 0 when used alone
        x.remove(3)  # 3 is output
        x = ",".join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        kwargs = {"x": x, "y": y, "n_folds": 0, "lambda": "1e-8:1e-2:100", "alpha": "0,0.5,1", "thresholds": "0:1:0.01"}
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs)
        # check the first in the models list. It should be the best
        colNames = [
            "STR",
            "OBS",
            "AGMT",
            "FNDX",
            "HIGD",
            "DEG",
            "CHK",
            "AGP1",
            "AGMN",
            "NLV",
            "LIV",
            "WT",
            "AGLP",
            "MST",
        ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #21
    def test_B_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        xList = []  
        for appendx in xrange(14):
            if (appendx == 0): 
                print "\nSkipping 0. Causes coefficient of 0 when used alone"
            elif (appendx == 3): 
                print "\n3 is output."
            else:
                xList.append(appendx)

        x = ','.join(map(str, xList))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y
        
        kwargs = {
            'x': x, 'y':  y, 'num_cross_validation_folds': 0, 
            'lambda': '1e-8:1e-2:100', 
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
            }
        # fails with num_cross_validation_folds
        print "Not doing num_cross_validation_folds with benign. Fails with 'unable to solve?'"

        gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs)
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK',
                     'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]

        # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[-1]], **kwargs)
        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #22
    def test_GLMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        y = "3"
        x = range(14)
        x.remove(0)  # 0. skipping causes coefficient of 0 when used alone
        x.remove(3)  # 3 is output
        x = ','.join(map(str, x))

        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        kwargs = {
            'x': x,
            'y': y,
            'n_folds': 0,
            'lambda': '1e-8:1e-2:100',
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
        }
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"

        gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey,
                                    timeoutSecs=120,
                                    **kwargs)
        # check the first in the models list. It should be the best
        colNames = [
            'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN',
            'NLV', 'LIV', 'WT', 'AGLP', 'MST'
        ]

        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #23
    def test_C_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        y = "1"
        xList = []  
        for appendx in xrange(9):
            if (appendx == 0):
                print "\n0 is member ID. not used"
            elif (appendx == 1):
                print "\n1 is output."
            else:
                xList.append(appendx)

        x = ','.join(map(str, xList))
        # just run the test with all x, not the intermediate results
        print "\nx:", x
        print "y:", y

        # FIX! thresholds is used in GLMGrid. threshold is used in GLM
        # comma separated means use discrete values
        # colon separated is min/max/step
        # FIX! have to update other GLMGrid tests
        kwargs = {
            'x': x, 'y':  y, 'num_cross_validation_folds': 2, 
            'beta_epsilon': 1e-4,
            'lambda': '1e-8:1e3:100', 
            'alpha': '0,0.5,1',
            'thresholds': '0:1:0.01'
            }

        gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs)
        colNames = ['D','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
        # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs)
        h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
Example #24
    def test_parse_nflx_loop_s3n_hdfs(self):
        DO_GLM = False
        DO_GLMGRID = True
        USE_HOME2 = False
        USE_S3 = False
        noPoll = False
        # benchmarkLogging = ['jstack', 'iostats']
        # benchmarkLogging = ['iostats']
        benchmarkLogging = []
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        synSize = 183

        if USE_HOME2:
            csvFilenameList = [
                # this should hit the "more" files too?
                ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize, 700
                 ),
                ("[0][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize, 700),
                ("[0][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize,
                 700),
                ("[0][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize,
                 700),
                # ("10k_small_gz/[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize , 700),
                # ("10k_small_gz/[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz", 10000 * synSize , 700),
            ]
        else:
            csvFilenameList = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz"),
                # 100 files takes too long on two machines?
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz",
                 "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz",
                 "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz",
                 "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz",
                 "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz",
                 "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz",
                 "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz",
                 "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz",
                 "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz",
                 "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz",
                 "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz",
                 "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz",
                 "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz",
                 "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz",
                 "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz",
                 "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz",
                 1 * avgMichalSize, 300),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz",
                 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz",
                 20 * avgMichalSize, 900),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz",
                 "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_1[0-4][0-9].dat.gz",
                 "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz",
                 "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz",
                 "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
                ("[A]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz",
                 "file_A_200_x55.dat.gz", 200 * (avgMichalSize / 2), 7200),
                ("[A-B]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz",
                 "file_B_400_x55.dat.gz", 400 * (avgMichalSize / 2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz",
                 "file_C_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz",
                 "file_D_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz",
                 "file_E_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz",
                 "file_F_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200),
            ]

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

        if USE_HOME2:
            bucket = "home2-0xdiag-datasets/1k_small_gz"
        else:
            bucket = "home-0xdiag-datasets"

        if USE_S3:
            URI = "s3://" + bucket
            protocol = "s3"
        else:
            URI = "s3n://" + bucket
            protocol = "s3n/hdfs"

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        pollTimeoutSecs = 180
        retryDelaySecs = 10
        # use i to forward reference in the list, so we can do multiple outstanding parses below
        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            ## for tryHeap in [54, 28]:
            h2oPerNode = 1
            # h1.4xlarge 60.5GB dram
            for tryHeap in [28]:

                print "\n", tryHeap, "GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
                # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
                jea = "-Dh2o.find-ByteBuffer-leaks=true"
                h2o_hosts.build_cloud_with_hosts(
                    h2oPerNode,
                    java_heap_GB=tryHeap,
                    # java_extra_args=jea,
                    enable_benchmark_log=True,
                    timeoutSecs=120,
                    retryDelaySecs=10,
                    # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                    # this is for our amazon ec hdfs
                    # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                    hdfs_name_node='10.78.14.235:9000',
                    hdfs_version='0.20.2')

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandbox_ignore_errors = True

                for trial in range(trialMax):
                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        importResult = h2o.nodes[0].import_s3(bucket)
                    else:
                        importResult = h2o.nodes[0].import_hdfs(URI)

                    s3nFullList = importResult['succeeded']
                    for k in s3nFullList:
                        key = k['key']
                        # just print the first file
                        # if 'nflx' in key and 'file_1.dat.gz' in key:
                        if csvFilepattern in key:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", key
                            break
                        else:
                            ### print key
                            pass

                    ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                    # error if none?
                    self.assertGreater(len(s3nFullList), 8,
                                       "Didn't see more than 8 files in s3n?")

                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    start = time.time()
                    parseKey = h2o.nodes[0].parse(
                        s3nKey,
                        key2,
                        timeoutSecs=timeoutSecs,
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    # totalBytes2/totalBytes3 may not be set below if fewer files remain
                    totalBytes2 = totalBytes3 = 0
                    if noPoll:
                        if (i + 1) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes2,
                             timeoutSecs) = csvFilenameList[i + 1]
                            s3nKey = URI + "/" + csvFilepattern
                            key2 = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", s3nKey, "to", key2
                            parse2Key = h2o.nodes[0].parse(
                                s3nKey,
                                key2,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                        if (i + 2) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes3,
                             timeoutSecs) = csvFilenameList[i + 2]
                            s3nKey = URI + "/" + csvFilepattern
                            key2 = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", s3nKey, "to", key2
                            parse3Key = h2o.nodes[0].parse(
                                s3nKey,
                                key2,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print s3nKey, 'parse time:', parseKey['response']['time']
                    print "parse result:", parseKey['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(
                            pattern=csvFilename,
                            timeoutSecs=timeoutSecs,
                            benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes / 1e6) / elapsed
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, csvFilepattern,
                            csvFilename, fileMBS, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    # BUG here?
                    if not noPoll:
                        # We should be able to see the parse result?
                        h2o_cmd.check_enums_from_inspect(parseKey)

                    #**********************************************************************************
                    # Do GLM too
                    # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                    if DO_GLM or DO_GLMGRID:
                        # these are all the columns that are enums in the dataset...too many for GLM!
                        x = range(542)  # don't include the output column
                        # remove the output too! (378)
                        for i in [
                                3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18,
                                19, 20, 424, 425, 426, 540, 541, 378
                        ]:
                            x.remove(i)
                        x = ",".join(map(str, x))

                        if DO_GLM:
                            algo = 'GLM'
                            GLMkwargs = {
                                'x': x,
                                'y': 378,
                                'case': 15,
                                'case_mode': '>',
                                'family': 'binomial',
                                'max_iter': 10,
                                'n_folds': 2,
                                'alpha': 0.2,
                                'lambda': 1e-5
                            }
                            start = time.time()
                            glm = h2o_cmd.runGLMOnly(
                                parseKey=parseKey,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging,
                                **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLM(self, glm, None,
                                                   **GLMkwargs)

                        else:
                            algo = 'GLMGrid'
                            GLMkwargs = {
                                'x': x,
                                'y': 378,
                                'case': 15,
                                'case_mode': '>',
                                'family': 'binomial',
                                'max_iter': 10,
                                'n_folds': 1,
                                'beta_epsilon': 1e-4,
                                'lambda': '1e-4',
                                'alpha': '0,0.5',
                                'thresholds': '0.5'
                            }
                            start = time.time()
                            glm = h2o_cmd.runGLMGridOnly(
                                parseKey=parseKey,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging,
                                **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLMGrid(self, glm, None,
                                                       **GLMkwargs)

                        h2o.check_sandbox_for_errors()
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, algo, csvFilepattern,
                            csvFilename, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    #**********************************************************************************
                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    ### storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    h2o_cmd.check_key_distribution()
                    h2o_cmd.delete_csv_key(csvFilename, s3nFullList)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
Example #25
    def test_GLMGrid_covtype_many(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'x': x,
            'y': y,
            'family': 'binomial',
            'link': 'logit',
            'n_folds': 2,
            'case_mode': '=',
            'case': 1,
            'max_iter': max_iter,
            'beta_eps': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
            'parallelism': 1,
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            GLMResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "GLMResult:", h2o.dump_json(GLMResult)
            job_key = GLMResult['response']['redirect_request_args']['job']
            model_key = GLMResult['response']['redirect_request_args']['destination_key']
            jobs.append( (job_key, model_key) )
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(10):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
                src_key=src_key, hex_key=hex_key, 
                timeoutSecs=10, noPoll=True, doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        for job_key, model_key in jobs:
            GLMResult = h2o.nodes[0].GLMGrid_view(job=job_key, destination_key=model_key)
            h2o_glm.simpleCheckGLMGrid(self, GLMResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs
Example #26
    def test_GLM_convergence_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50,  'cD', 300),
            (100, 100, 'cE', 300),
            (100, 200, 'cF', 300),
            (100, 300, 'cG', 300),
            (100, 400, 'cH', 300),
            (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        USEKNOWNFAILURE = True
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            if USEKNOWNFAILURE:
                csvFilename = 'failtoconverge_100x50.csv'
                csvPathname = h2o.find_file('smalldata/logreg/' + csvFilename)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                    'max_iter': 10, 
                    'weight': 1.0,
                    'link': 'familyDefault',
                    'n_folds': 2,
                    'beta_epsilon': 1e-4,
                    # gridded params: comma-separated = discrete values, colon-separated = min/max/step
                    'lambda': '1e-8:1e-3:1e2',
                    'alpha': '0,0.5,.75',
                    'thresholds': '0,1,0.2'
                    }

            if USEKNOWNFAILURE:
                kwargs['y'] = 50
            else:
                kwargs['y'] = y

            emsg = None
            for i in range(25):
                start = time.time()
                glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noise=("Jstack", None), **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can 
                # redo it in the browser for comparison
                warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)

                # gets the failed to converge, here, after we see it in the browser too
                x = re.compile("[Ff]ailed")
                if warnings:
                    for w in warnings:
                        if (re.search(x,w)): 
                            # first
                            if emsg is None: emsg = w
                            print w
                if emsg: break
        
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLMGridProgress")
                time.sleep(5)

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
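The gridded parameters in kwargs above use two spellings: a comma-separated string is a list of discrete values ('0,0.5,.75'), while a colon-separated string is a min/max/step range ('1e-8:1e-3:1e2'). H2O expands these server-side; the self-contained sketch below only illustrates the convention, and whether the step is additive or multiplicative for log-scale parameters like lambda is an assumption these tests don't settle:

def expand_grid_spec(spec):
    # comma-separated: discrete values, e.g. '0,0.5,.75' -> [0.0, 0.5, 0.75]
    if ':' not in spec:
        return [float(v) for v in spec.split(',')]
    # colon-separated: min/max/step, e.g. '0:1:0.2' -> [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    lo, hi, step = [float(v) for v in spec.split(':')]
    out = []
    while lo <= hi + 1e-12:
        out.append(lo)
        lo += step
    return out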
Example #27
0
    def test_parse_nflx_loop_s3n_hdfs(self):
        DO_GLM = True
        DO_GLMGRID = False
        USE_S3 = False
        noPoll = False
        benchmarkLogging = ['jstack','iostats']
        benchmarkLogging = ['iostats']
        benchmarkLogging = []
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        synSize = 183

        csvFilenameList = [
            (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
            (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
            (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use    
            (["A-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
            (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
            (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
        ]

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        pollTimeoutSecs = 180
        retryDelaySecs = 10
        # use i to forward reference in the list, so we can do multiple outstanding parses below
        for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):

            bucket = "home-0xdiag-datasets"
            ## for tryHeap in [54, 28]:
            h2oPerNode = 1
            # h1.4xlarge 60.5GB dram
            for tryHeap in [28]:
                if USE_S3:
                    protocol = "s3"
                else:
                    protocol = "s3n"
                print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
                
                # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
                # jea = "-Dh2o.find-ByteBuffer-leaks=true"
                h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10)
                # java_extra_args=jea,

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandboxIgnoreErrors = True

                for trial in range(trialMax):
                    # import a list of folders, one at a time (hdfs import can't take a pattern match).
                    # We want to be able to parse 800 files, but there are only 200 per folder, and
                    # importing the full bucket would be too slow.
                    for csvFolder in csvFolderList:
                        # since we delete the key, we have to re-import every iteration, to get it again
                        # s3n URI thru HDFS is not typical.
                        if USE_S3:
                            (importResult, importPattern) = h2i.import_only(
                                bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3')
                        else:
                            (importResult, importPattern) = h2i.import_only(
                                bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs')

                        # count everything that imported; print the first file matching our pattern
                        foundKeys = len(importResult['succeeded'])
                        for s in importResult['succeeded']:
                            if csvFilepattern in s['key']:
                                # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                                print "example file we'll use:", s['key']
                                break

                        # error if we didn't see enough files
                        self.assertGreater(foundKeys, 8, "Didn't see more than 8 files in s3n?")

                    src_key = csvFilepattern
                    hex_key = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", src_key, "to", hex_key
                    start = time.time()
                    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                        timeoutSecs=timeoutSecs, 
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    totalBytes2 = 0
                    totalBytes3 = 0
                    if noPoll:
                        if (i+1) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            # entries in this list are 5-tuples (the folder list comes first)
                            (csvFolderList, csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                            src_key = csvFilepattern
                            hex_key = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", src_key, "to", hex_key
                            parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                        if (i+2) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFolderList, csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                            src_key = csvFilepattern
                            hex_key = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", src_key, "to", hex_key
                            parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                                timeoutSecs=timeoutSecs, 
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print "parse result:", parseResult['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(pattern=csvFilename, 
                            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes/1e6)/elapsed
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    y = 378
                    if not noPoll:
                        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)


                    #**********************************************************************************
                    # Do GLM too
                    # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                    if DO_GLM or DO_GLMGRID:
                        # these are all the columns that are enums in the dataset...too many for GLM!
                        x = range(542) # don't include the output column
                        # remove the output too! (378)
                        for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                            x.remove(i)
                        x = ",".join(map(str,x))

                        if DO_GLM:
                            algo = 'GLM'
                            GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                            start = time.time()
                            glm = h2o_cmd.runGLM(parseResult=parseResult, 
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)

                        else:
                            algo = 'GLMGrid'
                            GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                                'lambda': '1e-4',
                                'alpha': '0,0.5',
                                'thresholds': '0.5'
                                }
                            start = time.time()
                            glm = h2o_cmd.runGLMGrid(parseResult=parseResult,
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                        h2o.check_sandbox_for_errors()
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    #**********************************************************************************
                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    ### storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    h2o_cmd.checkKeyDistribution()
                    h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
Example #28
0
    def test_parse_nflx_loop_s3n_hdfs(self):
        DO_GLM = True
        DO_GLMGRID = False
        USE_HOME2 = False
        USE_S3 = False
        noPoll = False
        benchmarkLogging = ['jstack','iostats']
        benchmarkLogging = ['iostats']
        benchmarkLogging = []
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        synSize = 183

        if USE_HOME2:
            csvFilenameList = [
                # this should hit the "more" files too?
                ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize , 700),
                ("[0][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize , 700),
                ("[0][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize , 700),
                ("[0][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize , 700),
                # ("10k_small_gz/[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize , 700),
                # ("10k_small_gz/[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz", 10000 * synSize , 700),
            ]
        else:
            csvFilenameList = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz"),
                # 100 files takes too long on two machines?
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                # ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),

                ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                # ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),

                ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
                # ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),

                ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                # ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),

                ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
                # ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),

                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
                # ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),

                ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),


                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
                ("[A]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
                ("[A-B]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_B_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_C_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_D_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_E_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
                ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_F_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
            ]

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
    
        if USE_HOME2:
            bucket = "home2-0xdiag-datasets/1k_small_gz"
        else:
            bucket = "home-0xdiag-datasets"

        if USE_S3:
            URI = "s3://" + bucket
            protocol = "s3"
        else:
            URI = "s3n://" + bucket
            protocol = "s3n/hdfs"

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        pollTimeoutSecs = 180
        retryDelaySecs = 10
        # use i to forward reference in the list, so we can do multiple outstanding parses below
        for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            ## for tryHeap in [54, 28]:
            h2oPerNode = 1
            # h1.4xlarge 60.5GB dram
            for tryHeap in [14]:
                
                print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
                # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
                jea = "-Dh2o.find-ByteBuffer-leaks=true"
                h2o_hosts.build_cloud_with_hosts(h2oPerNode, java_heap_GB=tryHeap,
                    # java_extra_args=jea,
                    enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10,
                    # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
                    # this is for our amazon ec hdfs
                    # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                    hdfs_name_node='10.78.14.235:9000',
                    hdfs_version='0.20.2')

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandbox_ignore_errors = True

                for trial in range(trialMax):
                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        importResult = h2o.nodes[0].import_s3(bucket)
                    else:
                        importResult = h2o.nodes[0].import_hdfs(URI)

                    s3nFullList = importResult['succeeded']
                    for k in s3nFullList:
                        key = k['key']
                        # just print the first file that matches
                        # if 'nflx' in key and 'file_1.dat.gz' in key: 
                        if csvFilepattern in key:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", key
                            break
                        else:
                            ### print key
                            pass

                    ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                    # error if none? 
                    self.assertGreater(len(s3nFullList),8,"Didn't see more than 8 files in s3n?")

                    s3nKey = URI + "/" + csvFilepattern
                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", s3nKey, "to", key2
                    start = time.time()
                    parseKey = h2o.nodes[0].parse(s3nKey, key2,
                        timeoutSecs=timeoutSecs, 
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    # avoid NameErrors at the totalBytes sum below if we're near the end of the list
                    totalBytes2 = 0
                    totalBytes3 = 0
                    if noPoll:
                        if (i+1) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                            s3nKey = URI + "/" + csvFilepattern
                            key2 = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", s3nKey, "to", key2
                            parse2Key = h2o.nodes[0].parse(s3nKey, key2,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                        if (i+2) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                            s3nKey = URI + "/" + csvFilepattern
                            key2 = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", s3nKey, "to", key2
                            parse3Key = h2o.nodes[0].parse(s3nKey, key2,
                                timeoutSecs=timeoutSecs, 
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print s3nKey, 'parse time:', parseKey['response']['time']
                    print "parse result:", parseKey['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(pattern=csvFilename, 
                            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes/1e6)/elapsed
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    # BUG here?
                    if not noPoll:
                        # We should be able to see the parse result?
                        h2o_cmd.check_enums_from_inspect(parseKey)

                    #**********************************************************************************
                    # Do GLM too
                    # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                    if DO_GLM or DO_GLMGRID:
                        # these are all the columns that are enums in the dataset...too many for GLM!
                        x = range(542) # don't include the output column
                        # remove the output too! (378)
                        for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                            x.remove(i)
                        x = ",".join(map(str,x))

                        if DO_GLM:
                            algo = 'GLM'
                            GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                            start = time.time()
                            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, 
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)

                        else:
                            algo = 'GLMGrid'
                            GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                                'lambda': '1e-4',
                                'alpha': '0,0.5',
                                'thresholds': '0.5'
                                }
                            start = time.time()
                            glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey,
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                        h2o.check_sandbox_for_errors()
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    #**********************************************************************************
                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    ### storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    h2o_cmd.check_key_distribution()
                    h2o_cmd.delete_csv_key(csvFilename, s3nFullList)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
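Both nflx benchmark variants compute throughput the same way: total compressed bytes (summed across the overlapped parses when noPoll is on) divided by wall-clock seconds, formatted into one line for the cloudPerfH2O log. A self-contained sketch of just that calculation, with the format string copied from the tests (the helper name benchmark_line is hypothetical):

def benchmark_line(nJvms, heapGB, pattern, filename, totalBytes, elapsed):
    # MB/sec over the whole overlapped run
    fileMBS = (totalBytes / 1e6) / elapsed
    return '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
        nJvms, heapGB, pattern, filename, fileMBS, elapsed)

# e.g. 100 michal-sized files parsed in 500 secs on a 4-node cloud
print benchmark_line(4, 14, "file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz",
    100 * 116561140, 500.0)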
Example #29
0
    def test_GLM_convergence_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50,  'cD', 300),
            (100, 100, 'cE', 300),
            (100, 200, 'cF', 300),
            (100, 300, 'cG', 300),
            (100, 400, 'cH', 300),
            (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        USEKNOWNFAILURE = True
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            if USEKNOWNFAILURE:
                csvFilename = 'failtoconverge_100x50.csv'
                csvPathname = h2o.find_file('smalldata/logreg/' + csvFilename)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                    'max_iter': 10, 
                    'weight': 1.0,
                    'link': 'familyDefault',
                    'n_folds': 2,
                    'beta_epsilon': 1e-4,
                    'lambda': '1e-8:1e-3:1e2',
                    'alpha': '0,0.5,.75',
                    'thresholds': '0,1,0.2'
                    }

            if USEKNOWNFAILURE:
                kwargs['y'] = 50
            else:
                kwargs['y'] = y

            emsg = None
            for i in range(2):
                start = time.time()
                # get rid of the Jstack polling
                glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can 
                # redo it in the browser for comparison
                warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)

                # gets the failed to converge, here, after we see it in the browser too
                x = re.compile("[Ff]ailed")
                if warnings:
                    for w in warnings:
                        if (re.search(x,w)): 
                            # first
                            if emsg is None: emsg = w
                            print w
                if emsg: break
        
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLMGridProgress")
                time.sleep(5)

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
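write_syn_dataset isn't included in this excerpt. Given how the tests call it, (csvPathname, rowCount, colCount, seed), and that y is the 0-based last column, a plausible minimal stand-in is sketched below; the actual data distribution used by the original helper is an assumption:

import csv
import random

def write_syn_dataset(csvPathname, rowCount, colCount, seed):
    # rowCount rows: colCount random features plus a 0/1 output as the
    # last column, matching y = colCount in the tests above (assumption)
    r = random.Random(seed)
    f = open(csvPathname, 'w')
    w = csv.writer(f)
    for _ in range(rowCount):
        row = [r.random() for _ in range(colCount)]
        row.append(r.randint(0, 1))
        w.writerow(row)
    f.close()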
Example #30
0
    def test_GLMGrid_covtype_many(self):
        csvFilename = "covtype.data"
        csvPathname = "UCI/UCI-large/covtype/" + csvFilename
        parseResult = h2i.import_parse(bucket="datasets", path=csvPathname, schema="put", timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        print "\n" + csvPathname, "    num_rows:", "{:,}".format(inspect["num_rows"]), "    num_cols:", "{:,}".format(
            inspect["num_cols"]
        )

        x = ""

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            "x": x,
            "y": y,
            "family": "binomial",
            "link": "logit",
            "n_folds": 2,
            "case_mode": "=",
            "case": 1,
            "max_iter": max_iter,
            "beta_eps": 1e-3,
            "lambda": "0,0.5,0.8",
            "alpha": "0,1e-8,1e-4",
            "parallel": 1,
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            GLMResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "GLMResult:", h2o.dump_json(GLMResult)
            job_key = GLMResult["response"]["redirect_request_args"]["job"]
            model_key = GLMResult["response"]["redirect_request_args"]["destination_key"]
            jobs.append((job_key, model_key))
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(10):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(
                bucket="datasets",
                path=csvPathname,
                schema="put",
                src_key=src_key,
                hex_key=hex_key,
                timeoutSecs=10,
                noPoll=True,
                doSummary=False,
            )

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        for job_key, model_key in jobs:
            GLMResult = h2o.nodes[0].GLMGrid_view(job=job_key, destination_key=model_key)
            h2o_glm.simpleCheckGLMGrid(self, GLMResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs