def test_parse_covtype_2(self):

        tryList = [
            ('covtype.data', 1, 30),
            # ('covtype20x.data', 20, 120),
        ]

        for (csvFilename, multiplyExpected, timeoutSecs) in tryList:

            for trial in range(16,24):
                # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
                importFolderPath = "standard"
                hex_key = 'covtype.hex'
                csvPathname = importFolderPath + "/" + csvFilename
                chunk_size = 2**trial
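                # trials 16..23 sweep chunk_size over powers of two:
                # 2**16 = 65536 (64 KB) up to 2**23 = 8388608 (8 MB)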
                print "Trial %s. Trying chunk_size %s (power of 2)" % (trial, chunk_size)

                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                    timeoutSecs=timeoutSecs, hex_key=hex_key,
                    chunk_size=chunk_size, doSummary=False)
                pA = h2o_cmd.ParseObj(parseResult)
                iA = h2o_cmd.InspectObj(pA.parse_key)
                print iA.missingList, iA.labelList, iA.numRows, iA.numCols

                for i in range(1):  # summary on column 0 only
                    co = h2o_cmd.runSummary(key=hex_key, column=i)

                k = parseResult['frames'][0]['frame_id']['name']
                # print "parseResult:", dump_json(parseResult)
                a_node = h2o.nodes[0]
                frames_result = a_node.frames(key=k, row_count=5)
                # print "frames_result from the first parseResult key", dump_json(frames_result)
                
                parseKeyIndexedCheck(frames_result, multiplyExpected)
    def test_parse_nfs(self):
        print "run as user 0xcustomer on machine with nfs /mnt/0xcustomer-datasets/c1"
        tryList = [
            ('iris2.csv', 'iris2.hex', 1, 30),
        ]

        for (csvFilename, hex_key, multiplyExpected, timeoutSecs) in tryList:
            importFolderPath = "/mnt/0xcustomer-datasets/c1"
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           hex_key=hex_key,
                                           chunk_size=4194304 / 2,  # 4 MB / 2 = 2 MB chunks
                                           doSummary=False)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=150 * multiplyExpected,
                                    expectedNumCols=5,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(0):  # disabled; bump 0 to re-enable per-column summaries
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            frames_result = h2o.nodes[0].frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)
            parseKeyIndexedCheck(frames_result, multiplyExpected)
    def test_delete_all_keys(self):
        # FIX! should have some model keys in here too, from RF etc.
        importFolderPath = 'standard'
        timeoutSecs = 500

        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        for trial in range(2):
            for csvFilename in csvFilenameList:
                csvPathname = importFolderPath + "/" + csvFilename

                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                               path=csvPathname,
                                               timeoutSecs=500)
                pA = h2o_cmd.ParseObj(parseResult)
                iA = h2o_cmd.InspectObj(pA.parse_key)
                parse_key = pA.parse_key
                numRows = iA.numRows
                numCols = iA.numCols
                labelList = iA.labelList

                h2i.delete_keys_at_all_nodes()
                print "Delete all keys. Shouldn't be any more?"
                h2o.nodes[0].remove_all_keys()

            print "\nTrial", trial, "completed\n"
    def test_parse_100k_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 1000, 'cA', 200, 200),
            (10, 2000, 'cA', 200, 200),
            (10, 4000, 'cA', 200, 200),
            (10, 8000, 'cA', 200, 200),
            (10, 9000, 'cA', 200, 200),
            (10, 10000, 'cA', 200, 200),
            (10, 100000, 'cA', 200, 200),
            # (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(
                path=csvPathname,
                schema='local',
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                doSummary=False,
                columnNames=None,
                intermediateResults=DO_INTERMEDIATE_RESULTS)

            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            print "Skipping the delete keys for now"
            if 1 == 0:
                # if not h2o.browse_disable:
                #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                #    time.sleep(5)
                h2i.delete_keys_at_all_nodes()
    def test_split_frame(self):

        csvFilename = 'iris.csv'
        csvPathname = 'iris/' + csvFilename
        hex_key = "iris.hex"

        parseResultA = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        hex_key=hex_key,
                                        timeoutSecs=10)

        print "Just split away and see if anything blows up"
        splitMe = hex_key

        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        iA = h2o_cmd.InspectObj(splitMe)
        origNumRows = iA.numRows
        origNumCols = iA.numCols
        for s in range(10):
            iA = h2o_cmd.InspectObj(splitMe)
            numRows = iA.numRows

            fsResult = h2o.n0.split_frame(dataset=splitMe, ratios='[0.5]')
            fs = OutputObj(fsResult, 'split_frame')
            d = fs.jobs[0].destination_frames
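            # a brief note (assumption): with ratios='[0.5]' the job reports two
            # output frames in destination_frames; each entry is a key reference
            # whose .name is the frame id string used for the follow-up inspects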

            split_keys = [split.name for split in d]

            iB = h2o_cmd.InspectObj(split_keys[0])
            iC = h2o_cmd.InspectObj(split_keys[1])

            numCols = iB.numCols
            split0_rows = iB.numRows
            split1_rows = iC.numRows

            # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split_keys[1]
            # the 0.5/0.5 split should leave the two halves within 1 row of each other
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
    def test_frame_split(self):

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResultA = h2i.import_parse(bucket='home-0xdiag-datasets',
                                        path=csvPathname,
                                        hex_key=hex_key,
                                        timeoutSecs=20)
        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        iA = h2o_cmd.InspectObj(splitMe)
        origNumRows = iA.numRows
        origNumCols = iA.numCols
        for s in range(20):
            iA = h2o_cmd.InspectObj(splitMe)
            numRows = iA.numRows

            fsResult = h2o.n0.frame_split(training_frame=splitMe,
                                          ratios='[0.5]')
            fs = OutputObj(fsResult, 'frame_split')
            model_key = fs.jobs[0].dest.name

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'],
                              'frame_split')
            # print "model:", dump_json(model)
            split_keys = [split._key.name for split in model.splits]

            iB = h2o_cmd.InspectObj(split_keys[0])
            iC = h2o_cmd.InspectObj(split_keys[1])

            numCols = iB.numCols
            split0_rows = iB.numRows
            split1_rows = iC.numRows

            # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split_keys[1]
            # the 0.5/0.5 split should leave the two halves within 1 row of each other
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split1_rows <= 1:
                break
    def test_exec2_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
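        # file_cat appends pathname1x to itself, so every column sum in the 2x
        # file should be exactly double the 1x sum; resultMult below encodes that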

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key)
            pA = h2o_cmd.ParseObj(parseResultA)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            iA = h2o_cmd.InspectObj(pA.parse_key)

            k = Key(hex_key)
            colResultList = []
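            # the loop below builds a Rapids 'sum' expression per column; the
            # trailing True is taken here to mean NA-removal (assumption)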
            for i in range(pA.numCols):
                result = Expr(Fcn('sum', k[:, i], True)).result
                colResultList.append(result)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # dividing by resultMult maps the N-x replication sums back to the 1x sums
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
    def test_parse_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            # overwrite the key each time to save space?
            (100, 100, 'cF', 600),
            (100, 5000, 'cF', 600),
            (100, 10000, 'cF', 600),
            # (100, 12000, 'cF', 600),
            # (100, 15000, 'cF', 600),
            # (100, 17000, 'cF', 600),
            (100, 20000, 'cF', 600),
            (100, 40000, 'cF', 600),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            csvFilenamegz = csvFilename + ".gz"
            csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
            h2o_util.file_gzip(csvPathname, csvPathnamegz)
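            # gzip the csv first; the parse below should decompress transparently
            # and report the same rowCount x colCount as the plain file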

            parseResult = h2i.import_parse(path=csvPathnamegz,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=DOSUMMARY)

            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
    def test_parse_covtype(self):

        tryList = [
            ('covtype.data', 1, 30),
            ('covtype20x.data', 20, 120),
        ]

        for (csvFilename, multiplyExpected, timeoutSecs) in tryList:
            # h2o-dev doesn't take ../.. type paths? make find_file return an absolute path
            a_node = h2o.nodes[0]

            importFolderPath = os.path.expanduser(
                "~/home-0xdiag-datasets/standard")
            csvPathname = importFolderPath + "/" + csvFilename
            importResult = a_node.import_files(path=csvPathname)

            # print "importResult:", dump_json(importResult)
            hex_key = importResult['destination_frames'][0]

            # note: k is not defined until after the parse below, so enabling
            # CAUSE_FAIL raises a NameError here (presumably the intended failure)
            if CAUSE_FAIL:
                frames_result = a_node.frames(key=k,
                                              row_count=5,
                                              timeoutSecs=timeoutSecs)
            # print "frames_result from the first importResult key", dump_json(frames_result)

            parseResult = a_node.parse(key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       chunk_size=4194304 * 4)  # 4 MB * 4 = 16 MB chunks
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=581012 * multiplyExpected,
                                    expectedNumCols=55,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(0):  # disabled; bump 0 to re-enable per-column summaries
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            # print "parseResult:", dump_json(parseResult)
            frames_result = a_node.frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)

            parseKeyIndexedCheck(frames_result, multiplyExpected)
    def test_plot_remove_keys(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100000, 100, 'cG', 400),
            (200000, 100, 'cH', 400),
            (400000, 100, 'cI', 400),
            (800000, 100, 'cJ', 400),
            (1000000, 100, 'cK', 400),
        ]

        xList = []
        eList = []
        fList = []
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            NUM_CASES = h2o_util.fp_format()
            sel = random.randint(0, NUM_CASES - 1)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount,
                                                   colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              sel)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parseElapsed = pA.python_elapsed
            parse_key = pA.parse_key
            byteSize = pA.byteSize
            numRows = iA.numRows
            numCols = iA.numCols
            print parse_key, parseElapsed, byteSize, numRows, numCols

            labelList = iA.labelList
            node = h2o.nodes[0]

            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(byteSize)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'byteSize'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
    def test_GLM_basic_1(self):
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        labelListUsed.remove('STR')
        labelListUsed.remove('FNDX')  # response removed also
        numColsUsed = numCols - 2
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?

            # glm parameters:

            # model_id Key<Model> False None []
            # training_frame Key<Frame> False None []
            # validation_frame Key<Frame> False None []
            # ignored_columns string[] False None []
            # drop_na20_cols boolean False False []
            # score_each_iteration boolean False False []
            # response_column VecSpecifier False None []
            # balance_classes boolean False False []
            # class_sampling_factors float[] False None []
            # max_after_balance_size float False 5.0 []
            # max_confusion_matrix_size int False 20 []
            # max_hit_ratio_k int False 10 []
            # family enum False gaussian [u'gaussian', u'binomial', u'poisson', u'gamma']
            # solver enum False IRLSM [u'AUTO', u'IRLSM', u'L_BFGS']

            # alpha double[] False None []

            # lambda double[] False None []
            # lambda_search boolean False False []
            # lambda_min_ratio double False -1.0 []
            # nlambdas int False -1 []

            # standardize boolean False True []
            # max_iterations int False -1 []
            # beta_epsilon double False 0.0001 []
            # link enum False family_default [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # prior double False -1.0 []
            # use_all_factor_levels boolean False False []
            # beta_constraints Key<Frame> False None []
            # max_active_predictors int False -1 []

            parameters = {
                'ignored_columns': '["STR"]',
                'response_column': 'FNDX',
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                # 'use_all_factor_levels': False,
            }

            model_key = 'benign_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data},
                             'mcms')
            m1 = mcms.data[1:]
            h0 = mcms.data[0]
            print "\nmcms", tabulate(m1, headers=h0)

            thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
            if 1 == 0:
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')
                print ""
                for i, c in enumerate(cmms.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_w2v_basic_1(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 500000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                                           checkHeader=1,
                                           delete_on_done=0,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            cA = h2o_test.OutputObj(iA.columns[0], "inspect_column")

            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            # note: cA was built from iA.columns[0], so every pass checks column 0;
            # the loop index only feeds the assertion messages
            for i in range(colCount):
                print cA.type, cA.missing
                self.assertEqual(
                    0, cA.missing,
                    "Column %s Expected %s. missing: %s is incorrect" %
                    (i, 0, cA.missing))
                self.assertEqual(
                    'string', cA.type,
                    "Column %s Expected %s. type: %s is incorrect" %
                    (i, 'string', cA.type))

            if DO_SUMMARY:
                for i in range(colCount):
                    co = h2o_cmd.runSummary(key=parse_key, column=i)
                    print co.label, co.type, co.missing, co.domain, sum(
                        co.bins)
                    self.assertEqual(
                        0, co.missing,
                        "Column %s Expected %s. missing: %s is incorrect" %
                        (i, 0, co.missing))
                    self.assertEqual(
                        'String', co.type,
                        "Column %s Expected %s. type: %s is incorrect" %
                        (i, 'String', co.type))

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'score_each_iteration': None,  # boolean false []
                    'minWordFreq': 5,  # int 5 []
                    'wordModel': 'SkipGram',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'HSM',  # enum # [u'HSM', u'NegSampling']
                    'negSampleCnt': 5,  # int 5 []
                    'vecSize': 100,  # int 100
                    'windowSize': 5,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(algo='word2vec',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=60)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                # not implemented?

                # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
                # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

                h2o_cmd.runStoreView()
    def test_bayes_basic(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        b = Key(train_key)

        model_key = 'bayesModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename

        # FIX! do I need to force enum for classification? what if I do regression after this?
        columnTypeDict = {54: 'Enum'}
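        # column index 54 is covtype's class label (C55, per the response_column
        # below); forcing it to Enum makes naive bayes treat it as categorical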
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       columnTypeDict=columnTypeDict,
                                       schema='local',
                                       chunk_size=4194304,
                                       hex_key=train_key,
                                       timeoutSecs=timeoutSecs)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        # make 1 thru 6 go to 1
        # change columnTypeDict to None above if I do this
        # Assign(b[:,54], b[:,54]-1)
        # Assign(b[:,54], b[:,54]!=0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        # run through a couple of parameter sets
        parameters = []
        parameters.append({
            'response_column': 'C55',  # still 1-55 on colnames
        })  # just default

        model_key = 'covtype_bayes.hex'

        for p in parameters:
            bmResult = h2o.n0.build_model(algo='naivebayes',
                                          destination_key=model_key,
                                          training_frame=train_key,
                                          validation_frame=train_key,
                                          parameters=p,
                                          timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame']  # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # "Numeric"
    # make the last column enum
    # Instead of string for parse, make this a dictionary, with column index, value
    # that's used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k,v in co:
            print k, v

    expected = []
    allowedDelta = 0

    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1

    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial', 
        'link': None, 
        'tweedie_variance_power': None,
        'tweedie_link_power': None,
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'prior1': None,
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
        'n_folds': 1,
    }


    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(
        algo='glm',
        destination_key=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')

    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    # note: 'validation' is never defined in this function and the deepcopy that
    # would seed self.validation1 is commented out, so this comparison is
    # effectively disabled
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None
    def test_GLM_error1(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(hex_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=binomial_key,
                                       columnTypeDict=columnTypeDict,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:, 54], b[:, 54] - 1)
        # make 1 thru 6 go to 1
        Assign(b[:, 54], b[:, 54] != 0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(5):
            parameters = {
                'response_column': 'C55',
                'max_iterations': 3,
                'solver': 'L_BFGS',
                'ignored_columns': '["C1"]',
                'alpha': '[0.1]',
                'max_after_balance_size': 1000.0,
                'class_sampling_factors': '[0.2]',
                # 'use_all_factor_levels': None,
                'lambda': '[0]',
            }

            bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=bHack,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self,
                                   model,
                                   parameters,
                                   labelList,
                                   labelListUsed,
                                   allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj(
                    {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1 == 0:
                    print ""
                    for i, c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_GBMGrid_basic_many(self):
        trainFilename = 'prostate.csv'
        train_key = 'prostate.hex'
        timeoutSecs = 300
        csvPathname = "logreg/" + trainFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put')

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': "['ID']", # this has to have []
            'response_column': 'CAPSULE',
            # 'balance_classes':
            # 'max_after_balance_size':
            # ??
            # 'ntrees': '[8, 10]',
            'ntrees': 8,
            # 'max_depth': '[8, 9]',
            'max_depth': 8,
            # ??
            # 'min_rows': '[1, 2]',
            'min_rows': 1,
            'nbins': 40,
            # ??
            # 'learn_rate': "[0.1, 0.2]",
            'learn_rate': 0.1,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed': 
        }

        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0

        for i in range(5):
            modelKey = 'GBMGrid_prostate_%s' % i
            bmResult = h2o.n0.build_model(
                algo='gbm',
                destination_key=modelKey,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')
            print "GBMResult:", h2o.dump_json(bm)

            # FIX! is this right for gridded? 
            job_key = bm.jobs[0].key.name
            model_key = bm.jobs[0].dest.name
            jobs.append( (job_key, model_key) )
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start
        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs

        for job_key, model_key in jobs:
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame'] # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_GLM_many_cols_4(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
        tryList = [
            (100000, 10, 'cA', 600),
            (100000, 100, 'cA', 600),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            expected = []
            allowedDelta = 0

            labelListUsed = list(labelList)
            print "labelListUsed", labelListUsed
            response = labelListUsed[-1]
            labelListUsed.remove(response)
            numColsUsed = numCols - 1
            for trial in range(1):
                # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
                # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
                # can we do classification with probabilities?
                # are only lambda and alpha grid searchable?
                parameters = {
                    'validation_frame': parse_key,
                    'ignored_columns': None,
                    # FIX! for now just use a column that's binomial
                    'response_column': response,  # can't take index now?
                    # FIX! when is this needed? redundant for binomial?
                    'balance_classes': False,
                    'max_after_balance_size': None,
                    'standardize': False,
                    'family': 'binomial',
                    'link': None,
                    'tweedie_variance_power': None,
                    'tweedie_link_power': None,
                    'alpha': '[1e-4]',
                    'lambda': '[0.5,0.25, 0.1]',
                    'prior1': None,
                    'lambda_search': None,
                    'nlambdas': None,
                    'lambda_min_ratio': None,
                    'use_all_factor_levels': False,
                    'n_folds': 1,
                }
                model_key = 'many_cols_glm.hex'
                bmResult = h2o.n0.build_model(algo='glm',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=60)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                       labelListUsed)

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult, 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')
    def test_DL_basic(self):
        h2o.nodes[0].remove_all_keys()
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # no cols ignored
        labelListUsed = list(labelList)
        labelListUsed.remove('STR')
        numColsUsed = numCols - 1
        for trial in range(1):
            parameters = {
                # required now
                # loss enum True None [u'MeanSquare', u'CrossEntropy']
                'loss': 'CrossEntropy',
                'validation_frame': parse_key,  # KeyIndexed None
                'ignored_columns': '["STR"]',  # string[] None
                'response_column': 'FNDX',  # string None
                'balance_classes': None,  # boolean false
                'max_after_balance_size': None,  # float Infinity
                'keep_cross_validation_splits': None,  # boolean false
                'checkpoint': None,  # Key None
                'overwrite_with_best_model': None,  # boolean true
                'expert_mode': None,  # boolean false
                'autoencoder': None,  # boolean false
                # 'use_all_factor_levels': None, # boolean true
                # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
                'activation': None,  # enum Rectifier 
                'hidden': None,  # int[] [200, 200]
                'epochs': None,  # double 10.0
                'train_samples_per_iteration': None,  # long -2
                'target_ratio_comm_to_comp': None,  # double 0.02
                'seed': None,  # long 1679194146842485659
                'adaptive_rate': None,  # boolean true
                'rho': None,  # double 0.99
                'epsilon': None,  # double 1.0E-8
                'rate': None,  # double 0.005
                'rate_annealing': None,  # double 1.0E-6
                'rate_decay': None,  # double 1.0
                'momentum_start': None,  # double 0.0
                'momentum_ramp': None,  # double 1000000.0
                'momentum_stable': None,  # double 0.0
                'nesterov_accelerated_gradient': None,  # boolean true
                'input_dropout_ratio': None,  # double 0.0
                'hidden_dropout_ratios': None,  # double[] None (this can grid?)
                'l1': None,  # double 0.0
                'l2': None,  # double 0.0
                'max_w2': None,  # float Infinity
                'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
                'initial_weight_scale': None,  # double 1.0
                # note: 'loss' is already set to 'CrossEntropy' at the top of this
                # dict; a duplicate 'loss': None entry here would silently override
                # that required value, so it is left out
                # loss enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
                'score_interval': None,  # double 5.0
                'score_training_samples': None,  # long 10000
                'score_validation_samples': None,  # long 0
                'score_duty_cycle': None,  # double 0.1
                'classification_stop': None,  # double 0.0
                'regression_stop': None,  # double 1.0E-6
                'quiet_mode': None,  # boolean false
                'max_confusion_matrix_size': None,  # int 20
                'max_hit_ratio_k': None,  # int 10
                # 'balance_classes' and 'max_after_balance_size' already appear above
                'class_sampling_factors': None,  # float[] None
                'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
                'diagnostics': None,  # boolean true
                'variable_importances': None,  # boolean false
                'fast_mode': None,  # boolean true
                'ignore_const_cols': None,  # boolean true
                'force_load_balance': None,  # boolean true
                'replicate_training_data': None,  # boolean false
                'single_node_mode': None,  # boolean false
                'shuffle_training_data': None,  # boolean false
                'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
                'sparse': None,  # boolean false
                'col_major': None,  # boolean false
                'average_activation': None,  # double 0.0
                'sparsity_beta': None,  # double 0.0
            }
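            # assumption: entries left as None are dropped by the request builder,
            # so the server-side defaults noted in the trailing comments apply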

            model_key = 'benign_dl.hex'
            bmResult = h2o.n0.build_model(algo='deeplearning',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            print "bmResult:", dump_json(bmResult)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            print "model:", dump_json(model)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
    def test_billion_rows(self):
        # just do the import folder once
        timeoutSecs = 1500

        csvFilenameAll = [
            # quick test first
            # "covtype.data",
            # then the real thing
            "billion_rows.csv.gz",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path='standard/' + csvFilename,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            parameters = {
                'response_column': 1,
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }
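            # lambda 0 disables regularization entirely, so alpha (the L1/L2 mix)
            # has no effect; this is a plain unpenalized GLM fit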
            model_key = 'B.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          destination_key=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()

    def test_GBM_basic(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        model_key = 'GBMModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=train_key,
                                       timeoutSecs=timeoutSecs)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': None,
            'score_each_iteration': True,
            'response_column': 'C55',
            'do_classification': True,
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            'variable_importance': False,
            # 'seed':
        }

        model_key = 'covtype_gbm.hex'
        bmResult = h2o.n0.build_model(algo='gbm',
                                      destination_key=model_key,
                                      training_frame=parse_key,
                                      parameters=parameters,
                                      timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=parse_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=parse_key,
                                        timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=parse_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_parse_rand_enum_compress(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 1000000

        # from command line arg -long
        if h2o_args.long_test_case:
            repeat = 1000
            scale = 10  # scale up the # of rows
            tryList = [
                (n * scale, 1, 'cI', 300),
                (n * scale, 1, 'cI', 300),
                (n * scale, 1, 'cI', 300),
            ]
        else:
            repeat = 1
            scale = 1
            tryList = [
                (n, 3, 'cI', 300),
                (n, 3, 'cI', 300),
                (n, 3, 'cI', 300),
            ]

        lastcolsHistory = []

        enumList = create_enum_list(listSize=ENUMS_NUM)

        for r in range(repeat):
            SEED_PER_FILE = random.randint(0, sys.maxint)
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c'  # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
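                # e.g. '2c'.decode('hex') -> ',' and int('2c', 16) -> 44, its
                # ASCII code: the char is used to write the file, the int is
                # what the parse API's separator= parameter expects below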
                print "colSepChar:", colSepChar

                rowSepHexString = '0a'  # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                lastcols = write_syn_dataset(csvPathname,
                                             enumList,
                                             rowCount,
                                             colCount,
                                             scale=1,
                                             colSepChar=colSepChar,
                                             rowSepChar=rowSepChar,
                                             SEED=SEED_PER_FILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)
                # why are we saving this?
                lastcolsHistory.append(lastcols)

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               check_header=0,
                                               timeoutSecs=30,
                                               separator=colSepInt,
                                               doSummary=DO_SUMMARY)
                parseResultA = h2i.import_parse(path=csvPathname,
                                                schema='put',
                                                hex_key=hex_key)
                # optional. only needed to extract parse_key?
                pA = h2o_cmd.ParseObj(parseResultA,
                                      expectedNumRows=rowCount,
                                      expectedNumCols=colCount)
                print pA.numRows
                print pA.numCols
                print pA.parse_key
                # InspectObj can take the json object directly, or re-read it with the key
                iA = h2o_cmd.InspectObj(pA.parse_key,
                                        expectedNumRows=rowCount,
                                        expectedNumCols=colCount,
                                        expectedMissinglist=[])

                self.assertEqual(rowCount, iA.numRows)
                self.assertEqual(colCount, iA.numCols)
    def test_w2v_basic_2(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 100
        tryList = [
            # (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)
            hex_key = "not_used.hex"

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                                           check_header=1,
                                           delete_on_done=0,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            src_key = h2i.find_key('syn_.*csv')

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'minWordFreq': 1,  # int 5 []
                    'wordModel': 'CBOW',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'NegSampling',  # enum [u'HSM', u'NegSampling']
                    'negSampleCnt': 1,  # int 5 []
                    'vecSize': 10,  # int 100
                    'windowSize': 2,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(algo='word2vec',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=10)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')

                h2o_cmd.runStoreView()
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # (rowCount, colCount, hex_key, choices, expected)
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(colCount):
                # walks across the columns triggering a summary on the col desired
                # runSummary returns a column object now. inspect and parse don't. They return json.
                # maybe eventually will make them return object? But I also pass expected stuff to them
                # should I pass expected to summary? no, more complex?
                co = h2o_cmd.runSummary(key=hex_key, column=i)
                print co.label, co.type, co.missing_count, co.domain, sum(
                    co.histogram_bins)

                print "\nComparing column %s to expected" % i
                self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \
                    (i, expectedNaCnt[i], co.missing_count))
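                # every non-missing value lands in exactly one histogram bin,
                # so the bin counts should sum to rowCount minus the NAs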
                self.assertEqual(rowCount - expectedNaCnt[i],
                                 sum(co.histogram_bins))

            h2p.green_print("\nDone with trial", trial)
            trial += 1

            h2i.delete_keys_at_all_nodes()
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # (rowCount, colCount, hex_key, rangeMin, rangeMax, expected)
            # parse setup error ? supposedly fixed now
            # (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
            (10000, 1, 'x.hex', -100000, 100000,
             ['C1', None, None, None, None, None]),
            (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
            (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None,
                                           None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxErr = 1.05 * maxErr
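            # reasoning: the summary histogram has ~20 bins across
            # [expectedMin, expectedMax], so a quantile estimate can be off by
            # up to half a bin width; the 1.05 factor pads for fp rounding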

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            assert len(expected) == 6
            co = h2o_cmd.runSummary(key=hex_key,
                                    column=0,
                                    expected=expected[1:],
                                    maxDelta=maxErr)
            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "maxErr", maxErr
            if co.label != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.99,
                    h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],

                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                )
    def test_parse_multi_header_single(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
        ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 rowCount,
                                                 headerData=None,
                                                 rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 dataRowsWithHeader,
                                                 headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)
                print f

            # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
            # frames = h2o.nodes[0].frames()['frames']
            frames = h2o.n0.frames()['frames']
            frames_dict = h2o_util.list_to_dict(frames, 'key/name')

            # print "frames:", dump_json(frames)
            # print "frames_dict:", dump_json(frames_dict)

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception(
                        "Didn't find syn_header* key in the import")
                # only print inside the HEADER branch; header is undefined otherwise
                print "Header Key = " + header

            # use regex. the only files in the dir will be the ones we just created with *fileN* match
            start = time.time()

            # does h2o-dev take a regex? or do we need to glob
            parseResult = h2i.parse_only(
                pattern='*' + rowxcol + '*',
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                checkHeader="1")  # header_from_file=header

            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=totalDataRows,
                                  expectedNumCols=totalCols)
            print pA.numRows
            print pA.numCols
            print pA.parse_key

            expectedLabelList = headerData.split(",")
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=totalDataRows,
                                    expectedNumCols=totalCols,
                                    expectedMissinglist=[],
                                    expectedLabelList=expectedLabelList)

            if DO_RF:
                # put in an ignore param, that will fail unless headers were parsed correctly
                if HEADER:
                    kwargs = {
                        'sample_rate': 0.75,
                        'max_depth': 25,
                        'ntrees': 1,
                        'ignored_cols_by_name': 'ID,CAPSULE'
                    }
                else:
                    kwargs = {
                        'sample_rate': 0.75,
                        'max_depth': 25,
                        'ntrees': 1
                    }

                rfv = h2o_cmd.runRF(parseResult=parseResult,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)

            h2o.check_sandbox_for_errors()
    def test_kmeans_sphere100(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        if SHUFFLE_SPHERES:
            # since we create spheres in order
            csvFilename2 = 'syn_spheres100_shuffled.csv'
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
            h2o_util.file_shuffle(csvPathname, csvPathname2)
        else:
            csvFilename2 = csvFilename
            csvPathname2 = csvPathname

        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        numColsUsed = numCols
        labelListUsed = labelList

        ### h2b.browseTheCloud()

        # try a couple of trials, to see if all inits by h2o are good
        # does it break if cols is not specified?
        destination_key = 'syn_spheres100.hex'
        cols = ",".join(map(str,range(DIMENSIONS)))
        for trial in range(2):
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                'score_each_iteration': False,
                'k': CLUSTERS,
                'max_iterations': 50,
                'standardize': False,
                # 'seed': kmeansSeed,
                'init': 'Furthest',
            }

            timeoutSecs = 100
            model_key = 'sphere100_k.hex'
            kmeansResult = h2o.n0.build_model(
                algo='kmeans',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=timeoutSecs)

            modelResult = h2o.n0.models(key=model_key)
            km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)

            # no expected row/error?
            expected = [(None, c, None, None) for c in centersList] 
            expected.sort(key=lambda tup: sum(tup[1]))
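            # sorting by coordinate sum should put the expected centers in the
            # same order as km.tuplesSorted, so the comparison below can match
            # them positionally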
            h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta=[.01, .01, .01])

            print "Trial #", trial, "completed"
    def test_DL_mnist(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'laptop/mnist/train.csv.gz'
        csvPathname_test = 'laptop/mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='bigdata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        numCols = iA.numCols
        labelList = iA.labelList
        parseResultV = h2i.import_parse(bucket='bigdata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)

        response = numCols - 1

        # make a random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'
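        # the random suffix makes the model key unique per run (presumably so
        # reruns can't collide with a stale key left in the cloud)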

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': None,  # string[] None
            'response_column': labelList[response],  # string None
            'balance_classes': None,  # boolean false
            'max_after_balance_size': None,  # float Infinity
            'keep_cross_validation_splits': None,  # boolean false
            'checkpoint': None,  # Key None
            'overwrite_with_best_model': None,  # boolean true
            'expert_mode': None,  # boolean false
            'autoencoder': None,  # boolean false
            'use_all_factor_levels': None,  # boolean true
            # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
            'activation': 'RectifierWithDropout',  # enum Rectifier 
            'hidden': '[117,131,129]',  # int[] [200, 200]
            'epochs': 2.0,  # double 10.0
            'train_samples_per_iteration': None,  # long -2
            'target_ratio_comm_to_comp': None,  # double 0.02
            'seed': None,  # long 1679194146842485659
            'adaptive_rate': False,  # boolean true
            'rho': None,  # double 0.99
            'epsilon': None,  # double 1.0E-8
            'rate': None,  # double 0.005
            'rate_annealing': None,  # double 1.0E-6
            'rate_decay': None,  # double 1.0
            'momentum_start': 0.5,  # double 0.0
            'momentum_ramp': 100000,  # double 1000000.0
            'momentum_stable': 0.9,  # double 0.0
            'nesterov_accelerated_gradient': None,  # boolean true
            'input_dropout_ratio': 0.2,  # double 0.0
            'hidden_dropout_ratios': None,  # double[] None (this can grid?)
            'l1': 1e-5,  # double 0.0
            'l2': 1e-7,  # double 0.0
            'max_w2': 15,  # float Infinity
            'initial_weight_distribution':
            None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
            'initial_weight_scale': None,  # double 1.0
            'loss':
            'CrossEntropy',  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
            'score_interval': None,  # double 5.0
            'score_training_samples': None,  # long 10000
            'score_validation_samples': None,  # long 0
            'score_duty_cycle': None,  # double 0.1
            'classification_stop': None,  # double 0.0
            'regression_stop': None,  # double 1.0E-6
            'quiet_mode': None,  # boolean false
            'max_confusion_matrix_size': None,  # int 20
            'max_hit_ratio_k': None,  # int 10
            'balance_classes': None,  # boolean false
            'class_sampling_factors': None,  # float[] None
            'max_after_balance_size': None,  # float Infinity
            'score_validation_sampling':
            None,  # enum Uniform [u'Uniform', u'Stratified']
            'diagnostics': None,  # boolean true
            'variable_importances': None,  # boolean false
            'fast_mode': None,  # boolean true
            'ignore_const_cols': None,  # boolean true
            'force_load_balance': None,  # boolean true
            'replicate_training_data': None,  # boolean false
            'single_node_mode': None,  # boolean false
            'shuffle_training_data': None,  # boolean false
            'missing_values_handling':
            None,  # enum MeanImputation [u'Skip', u'MeanImputation']
            'sparse': None,  # boolean false
            'col_major': None,  # boolean false
            'average_activation': None,  # double 0.0
            'sparsity_beta': None,  # double 0.0
        }
        expectedErr = 0.057  ## expected validation error for the above model
        relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

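        # accept the run unless the scored error differs from expectedErr
        # (0.057) by more than relTol (20%); Hogwild! updates make the exact
        # number nondeterministic across runs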
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
    def test_GLM_covtype(self):
        importFolderPath = "standard"
        csvFilename = "covtype.data"
        hex_key = "covtype.hex"
        bucket = "home-0xdiag-datasets"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key, 
            check_header=1, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        labelListUsed = list(labelList)
        labelListUsed.remove('C54')
        numColsUsed = numCols - 1
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                # FIX! for now just use a column that's binomial
                'response_column': 'C54',
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial', 
                'link': None, 
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5,0.25, 0.1]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'use_all_factor_levels': False,
                # NPE with n_folds 2?
                'n_folds': 1,
            }
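            # alpha/lambda are passed as stringified lists here; with three
            # lambda values h2o presumably fits a short regularization path
            # (strongest to weakest) rather than a single penalized model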

            model_key = 'covtype_glm.hex'
            bmResult = h2o.n0.build_model(
                algo='glm',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')


            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
            m1 = mcms.data[1:]
            h0 = mcms.data[0]
            print "\nmcms", tabulate(m1, headers=h0)

            thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
            cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

            if 1 == 0:
                print ""
                for i, c in enumerate(cmms.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_DL_airlines_small(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'train.hex'
        validation_key = 'validation.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)

        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)
        pAV = h2o_cmd.ParseObj(parseResultV)
        iAV = h2o_cmd.InspectObj(pAV.parse_key)

        # make a random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': "['IsDepDelayed_REC']",  # string[] None
            'response_column': 'IsDepDelayed',  # string None
            'loss': 'CrossEntropy'
        }
        expectedErr = 0.32  ## expected validation error for the above model
        relTol = 0.15  ## 15% rel. error tolerance due to Hogwild!

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
    def test_GBM_airlines(self):
        files = [
            ('datasets', 'airlines_all.05p.csv', 'airlines_all.05p.hex', 1800,
             'IsDepDelayed'),
            # ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
        ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            model_key = 'GBMModelKey'
            # IsDepDelayed might already be enum, but just to be sure
            parseResult = h2i.import_parse(
                path=csvPathname,
                schema='hdfs',
                hex_key=trainKey,
                columnTypeDict={'IsDepDelayed': 'Enum'},
                timeoutSecs=timeoutSecs)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            labelListUsed = list(labelList)
            numColsUsed = numCols

            parameters = {
                'validation_frame': trainKey,
                # 'ignored_columns': '[CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed]',
                'response_column': response,
                # 'balance_classes':
                # 'max_after_balance_size':
                'ntrees': 2,
                'max_depth': 10,
                'min_rows': 3,
                'nbins': 40,
                'learn_rate': 0.2,
                # 'loss': 'multinomial',
                # FIX! doesn't like it?
                # 'loss': 'Bernoulli',
                # FIX..no variable importance for GBM yet?
                # 'variable_importance': False,
                # 'seed':
            }

            bmResult = h2o.n0.build_model(algo='gbm',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=360)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            # print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame']  # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')