Example #1
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    y = "10"
    x = ""
    # Took num_cross_validation_folds out, because GLM's reported time doesn't include cross-validation and it's slow;
    # wanted to compare GLM's reported time to my own measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'x': x, 'y':  y, 'case': -1}

    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in",  (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
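All of these snippets follow the same comparison pattern: the first run's 'validations' (or 'validation') dict is saved, and each later run is compared against it on a single key such as 'err', 'auc', or 'mse'. The sketch below only illustrates what such a relative-tolerance check might look like; the helper name, the sample values, and the 10% tolerance are assumptions for illustration, not the actual h2o_glm.compareToFirstGlm implementation.

# Illustrative sketch only: name, tolerance, and sample values are assumed,
# not taken from h2o_glm.compareToFirstGlm.
def compare_to_first(key, current, first, rel_tol=0.10):
    a = float(current[key])
    b = float(first[key])
    denom = abs(b) if b != 0 else 1.0      # avoid division by zero
    delta = abs(a - b) / denom
    print "%s: first=%s current=%s relative delta=%.4f" % (key, b, a, delta)
    assert delta <= rel_tol, "%s drifted by more than %d%%" % (key, int(rel_tol * 100))

first = {'err': 0.245}                     # placeholder values for the illustration
later = {'err': 0.251}
compare_to_first('err', later, first)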
Example #2
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ["covtype.data"]
        else:
            csvFilenameList = [
                "covtype200x.data",
                "covtype200x.data",
                "covtype.data",
                "covtype.data",
                "covtype20x.data",
                "covtype20x.data",
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = "/home/0xdiag/datasets/standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm["GLMModel"]
            coefficients = GLMModel["coefficients"]
            validationsList = GLMModel["validations"]
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write(".")
            sys.stdout.flush()
Example #3
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values 1,-1. need to specify case
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in",  (time.time() - start), "secs (python)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
Example #4
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'x': x, 'y': y, 'case': -1}

    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                             timeoutSecs=timeoutSecs,
                             **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
Example #5
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket,
                                   path=csvPathname,
                                   hex_key=csvFilename + ".hex",
                                   schema='put',
                                   timeoutSecs=10)
    y = 10
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'response': y, 'alpha': 0, 'family': 'binomial'}

    h2o.nodes[0].to_enum(src_key=parseResult['destination_key'],
                         column_index=y + 1)

    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult,
                         timeoutSecs=timeoutSecs,
                         **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    glm_model = glm['glm_model']
    validation = glm_model['submodels'][0]['validation']

    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'auc', validation, self.validation1)
    else:
        self.validation1 = copy.deepcopy(validation)
Example #6
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"


    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)

    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
        vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))

    # compare this glm to the first one. since the files are replications, 
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        self.glmScore1 = copy.deepcopy(glmScore)
Example #7
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
        hex_key=csvFilename + ".hex", schema='put', timeoutSecs=10)
    y = 10
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'response':  y, 'alpha': 0, 'family': 'binomial'}

    h2o.nodes[0].to_enum(src_key=parseResult['destination_key'], column_index=y+1)

    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in",  (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    glm_model = glm['glm_model']
    validation = glm_model['submodels'][0]['validation']

    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'auc', validation, self.validation1)
    else:
        self.validation1 = copy.deepcopy(validation)
Example #8
def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5",
    timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    key2 = csvFilename + ".hex"
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"
    x = ""
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds="0.5",
        timeoutSecs=timeoutSecs)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, 
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    validation = glmScore['validation']
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validation)
Example #9
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
                ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
                ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #10
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"


    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)

    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
        vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))

    # compare this glm to the first one. since the files are replications, 
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        self.glmScore1 = copy.deepcopy(glmScore)
Example #11
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket,
                                   path=csvPathname,
                                   hex_key=csvFilename + ".hex",
                                   schema='put',
                                   timeoutSecs=30)
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values 1,-1. need to specify case
    kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult,
                         timeoutSecs=timeoutSecs,
                         **kwargs)
    print "GLM in", (time.time() - start), "secs (python)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']

    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
Example #12
    def test_GLM_catdata_hosts(self):
        # these are still in /home/kevin/scikit/datasets/logreg
        # FIX! just two for now..
        csvFilenameList = [
            "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz"
        ]

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # save the first, for all comparisons, to avoid slow drift with each iteration
        validations1 = {}
        for csvFilename in csvFilenameList:
            csvPathname = h2o.find_file('smalldata/' + csvFilename)
            # I use this if i want the larger set in my localdir
            # csvPathname = h2o.find_file('/home/kevin/scikit/datasets/logreg/' + csvFilename)

            print "\n" + csvPathname

            start = time.time()
            # FIX! why can't I include 0 here? it keeps getting 'unable to solve' if 0 is included
            # 0 by itself is okay?
            kwargs = {
                'y': 7,
                'x': '1,2,3,4,5,6',
                'family': "binomial",
                'n_folds': 3,
                'lambda': 1e-4
            }
            timeoutSecs = 200
            glm = h2o_cmd.runGLM(csvPathname=csvPathname,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 6, **kwargs)

            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            validationsList = glm['GLMModel']['validations']
            print validationsList
            validations = validationsList[0]

            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations,
                                          validations1)
            else:
                validations1 = copy.deepcopy(validations)

            sys.stdout.write('.')
            sys.stdout.flush()
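The kwargs dict above carries 'lambda' as a plain dictionary key because lambda is a reserved word in Python: it cannot appear as a keyword-argument name in a call, but it expands fine through **kwargs, which is why these tests build their GLM arguments as a dict. A minimal standalone illustration follows; run_glm_stub is a hypothetical stand-in, not an h2o function.

# 'lambda' cannot be written as a keyword argument, but a dict key works.
def run_glm_stub(**params):
    # hypothetical stand-in for h2o_cmd.runGLM: just echo what was passed
    print "GLM called with:", params

kwargs = {'y': 7, 'family': 'binomial', 'lambda': 1e-4}
run_glm_stub(**kwargs)          # fine
# run_glm_stub(lambda=1e-4)     # SyntaxError: 'lambda' is a keyword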
Example #13
    def test_GLM2_from_import_hosts(self):
        h2o.beta_features = True
        csvFilenameList = [
            'covtype.data',
            'covtype20x.data',
        ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validation1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key='A.hex',
                                           timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            case = 1
            y = 54
            execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, case)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            kwargs = {'response': y, 'n_folds': 2, 'family': "binomial"}
            glm = h2o_cmd.runGLM(parseResult={'destination_key': 'A.hex'},
                                 timeoutSecs=2000,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # compare this glm to the first one. since the files are replications, the results
            # should be similar?
            validation = glm['glm_model']['submodels'][0]['validation']

            if validation1:
                h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1)
            else:
                validation1 = copy.deepcopy(validation)
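The GLM2 variants (this example and Example #16) binarize the response column with an exec expression rather than passing a 'case' parameter: the expression overwrites column y + 1 with the result of comparing it against the chosen case value. The string construction itself is plain Python and can be checked without a cluster; the values below are the ones used in the example.

y = 54
case = 1
execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, case)
print execExpr    # prints: A.hex[,55]=(A.hex[,55]==1)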
Example #14
    def test_GLM2_catdata_hosts(self):
        h2o.beta_features = True
        # these are still in /home/kevin/scikit/datasets/logreg
        # FIX! just two for now..
        csvFilenameList = [
            "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz"
        ]

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # save the first, for all comparisons, to avoid slow drift with each iteration
        validation1 = {}
        for csvFilename in csvFilenameList:
            csvPathname = csvFilename
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           schema='put')
            print "\n" + csvPathname

            start = time.time()
            # FIX! why can't I include 0 here? it keeps getting 'unable to solve' if 0 is included
            # 0 by itself is okay?
            kwargs = {
                'response': 7,
                'family': "binomial",
                'n_folds': 3,
                'lambda': 1e-4
            }
            timeoutSecs = 200
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 'C7', **kwargs)

            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            # compare this glm to the first one. since the files are replications, the results
            # should be similar?
            validation = glm['glm_model']['submodels'][0]['validation']

            if validation1:
                h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1)
            else:
                validation1 = copy.deepcopy(validation)
Example #15
    def test_GLM_catdata_hosts(self):
        # these are still in /home/kevin/scikit/datasets/logreg
        # FIX! just two for now..
        csvFilenameList = [
            "1_100kx7_logreg.data.gz",
            "2_100kx7_logreg.data.gz"
        ]

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        # save the first, for all comparisons, to avoid slow drift with each iteration
        validations1 = {}
        for csvFilename in csvFilenameList:
            csvPathname = h2o.find_file('smalldata/' + csvFilename)
            # I use this if i want the larger set in my localdir
            # csvPathname = h2o.find_file('/home/kevin/scikit/datasets/logreg/' + csvFilename)

            print "\n" + csvPathname

            start = time.time()
            # FIX! why can't I include 0 here? it keeps getting 'unable to solve' if 0 is included
            # 0 by itself is okay?
            kwargs = {'y': 7, 'x': '1,2,3,4,5,6', 'family': "binomial", 'num_cross_validation_folds': 3, 'lambda': 1e-4}
            timeoutSecs = 200
            glm = h2o_cmd.runGLM(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 6, **kwargs)

            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            validationsList = glm['GLMModel']['validations']
            print validationsList
            validations = validationsList[0]

            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #16
    def test_GLM2_from_import_hosts(self):
        h2o.beta_features = True
        csvFilenameList = [
            'covtype.data',
            'covtype20x.data',
            ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validation1= {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key='A.hex', timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            case = 1
            y = 54
            execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, case)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            kwargs = {'response': y, 'n_folds': 2, 'family': "binomial"}
            glm = h2o_cmd.runGLM(parseResult={'destination_key': 'A.hex'}, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # compare this glm to the first one. since the files are replications, the results
            # should be similar?
            validation = glm['glm_model']['submodels'][0]['validation']

            if validation1:
                h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1)
            else:
                validation1 = copy.deepcopy(validation)
Example #17
    def test_GLM_catdata_hosts(self):
        # these are still in /home/kevin/scikit/datasets/logreg
        # FIX! just two for now..
        csvFilenameList = [
            "1_100kx7_logreg.data.gz",
            "2_100kx7_logreg.data.gz"
        ]

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # save the first, for all comparisons, to avoid slow drift with each iteration
        validations1 = {}
        for csvFilename in csvFilenameList:
            csvPathname = csvFilename
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
            print "\n" + csvPathname

            start = time.time()
            # FIX! why can't I include 0 here? it keeps getting 'unable to solve' if 0 is included
            # 0 by itself is okay?
            kwargs = {'y': 7, 'x': '1,2,3,4,5,6', 'family': "binomial", 'n_folds': 3, 'lambda': 1e-4}
            timeoutSecs = 200
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 'C7', **kwargs)

            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            validationsList = glm['GLMModel']['validations']
            print validationsList
            validations = validationsList[0]

            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)
Example #18
    def test_GLM2_catdata_hosts(self):
        h2o.beta_features = True
        # these are still in /home/kevin/scikit/datasets/logreg
        # FIX! just two for now..
        csvFilenameList = [
            "1_100kx7_logreg.data.gz",
            "2_100kx7_logreg.data.gz"
        ]

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        # save the first, for all comparisons, to avoid slow drift with each iteration
        validation1 = {}
        for csvFilename in csvFilenameList:
            csvPathname = csvFilename
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
            print "\n" + csvPathname

            start = time.time()
            # FIX! why can't I include 0 here? it keeps getting 'unable to solve' if 0 is included
            # 0 by itself is okay?
            kwargs = {'response': 7, 'family': "binomial", 'n_folds': 3, 'lambda': 1e-4}
            timeoutSecs = 200
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, 'C7', **kwargs)

            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            # compare this glm to the first one. since the files are replications, the results
            # should be similar?
            validation = glm['glm_model']['submodels'][0]['validation']

            if validation1:
                h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1)
            else:
                validation1 = copy.deepcopy(validation)
Example #19
def glm_score(self, csvFilename, csvPathname, modelKey, timeoutSecs=3):
    print "\nStarting GLM score of", csvFilename
    key2 = csvFilename + ".hex"
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
    y = "10"
    x = ""
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, timeoutSecs=timeoutSecs)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glmScore['GLMModel']
    validationsList = glmScore['GLMModel']['validations']
    validations = validationsList[0]
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
Example #20
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # "Numeric"
    # make the last column enum
    # Instead of string for parse, make this a dictionary, with column index, value
    # that's used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k,v in co:
            print k, v

    expected = []
    allowedDelta = 0

    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1

    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial', 
        'link': None, 
        'tweedie_variance_power': None,
        'tweedie_link_power': None,
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'prior1': None,
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
        'n_folds': 1,
    }


    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(
        algo='glm',
        destination_key=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')

    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None
Example #21
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
                ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #22
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
            ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=2000,
                                     **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations,
                                          validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients,
                                          coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush()
Example #23
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
            ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
            ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=2000,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)

            GLMModel = glm['GLMModel']
            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations,
                                          validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, 'C1', coefficients,
                                          coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush()
Example #24
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # "Numeric"
    # make the last column enum
    # Instead of string for parse, make this a dictionary, with column index, value
    # that's used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k,v in co:
            print k, v

    expected = []
    allowedDelta = 0

    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1

    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial', 
        'link': None, 
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
    }


    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(
        algo='glm',
        model_id=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')

    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None