Example 1
    def test_H_Basic(self):
        # maybe best to extract the key from an import first?
        # this isn't used much, maybe we don't care about this

        h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
        headerKey = h2i.find_key('syn_header.csv')
        # separator 44 is the ASCII code for comma
        h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv", header=1, header_from_file=headerKey, separator=44)
    
   
        # symbolic links work
        # ln -s /home/0xdiag/datasets home-0xdiag-datasets
        # lrwxrwxrwx 1 kevin kevin     21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
        h2i.import_parse(path="standard/covtype.data", bucket="home-0xdiag-datasets")
Example 2
    def test_H_Basic(self):
        # maybe best to extract the key from an import first?
        # this isn't used much, maybe we don't care about this

        h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
        headerKey = h2i.find_key('syn_header.csv')
        # separator 44 is the ASCII code for comma
        h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv",
                         header=1,
                         header_from_file=headerKey,
                         separator=44)

        # symbolic links work
        # ln -s /home/0xdiag/datasets home-0xdiag-datasets
        # lrwxrwxrwx 1 kevin kevin     21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
        h2i.import_parse(path="standard/covtype.data",
                         bucket="home-0xdiag-datasets")
Example 3
    def test_w2v_basic_2(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 100
        tryList = [
            # (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)
            hex_key = "not_used.hex"

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                                           check_header=1,
                                           delete_on_done=0,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            src_key = h2i.find_key('syn_.*csv')

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'minWordFreq': 1,  # int 5 []
                    'wordModel': 'CBOW',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'NegSampling',  # enum [u'HSM', u'NegSampling']
                    'negSampleCnt': 1,  # int 5 []
                    'vecSize': 10,  # int 100
                    'windowSize': 2,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(algo='word2vec',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=10)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')

                h2o_cmd.runStoreView()
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500):  # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1, 64)  # random length headers
                headerName = ''.join(
                    [random.choice(allowedLetters) for _ in range(l)])
                # keep trying until we get a header name we haven't used already; names have to be unique
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
        ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1

            DATA_HAS_HDR_ROW = random.randint(0, 1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just a comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0

            GZIP_DATA = random.randint(0, 1)
            GZIP_HEADER = random.randint(0, 1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # don't put a header in a data file with a different separator?
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # Hack: if both data and header files have a header, then, just in case
            # the header and data files should have the same separator
            # if they don't, make header match data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','

            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k, v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab; it will auto-detect space and comma
            if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)
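                # ord() maps the single separator char to its ASCII code (e.g. ord('\t') == 9);
                # the parse API takes the numeric code here, not the character itself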

            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0, 1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0, 1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "

            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData = SEP_CHAR_GEN.join(hfdList)

            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(
                    trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(
                    csvPathname,
                    rowCount,
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None),
                    rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT,
                    sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(
                        csvPathname, SYNDATASETS_DIR + "/not_used_data_" +
                        csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname

            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(
                trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(
                hdrPathname,
                dataRowsWithHeader,
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None),
                rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT,
                sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER:
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(
                    hdrPathname,
                    SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file']
                                       == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] = 1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] = 1
            else:
                kwargs['header'] = 0
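            # net effect: header=1 whenever a header row exists somewhere (header file or data file), else header=0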

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
            else:
                pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern,
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header,
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch between gen and params?
            h2oLosesOneData = (headerRowsDone
                               == 0) and (kwargs['header']
                                          == 1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1

            if 1 == 0:  # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {
                'sample': 100,
                'depth': 25,
                'ntree': 2,
                'ignore': ignoreForRf
            }
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
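Condensed, the put-then-parse flow this test exercises looks roughly like the sketch below. It only reuses the h2i/h2o_cmd calls already shown above (with os, h2i and h2o_cmd assumed to be imported as in the test); upload_and_parse is a hypothetical name:

def upload_and_parse(syn_dir, pattern, hex_key, **parse_kwargs):
    # hypothetical helper: PUT every locally generated file to the node, then parse by glob pattern
    for f in os.listdir(syn_dir):
        h2i.import_only(path=syn_dir + "/" + f, schema='put', noPrint=True)
    h2o_cmd.runStoreView()
    return h2i.parse_only(pattern=pattern, hex_key=hex_key, **parse_kwargs)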
Example 5
def doGBM(f, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
    debug = False
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre    = 'DEBUG'
    gbmbenchcsv = 'benchmarks/'+build+'/'+pre+'gbmbench.csv'
    if not os.path.exists(gbmbenchcsv):
        output = open(gbmbenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(gbmbenchcsv,'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"    
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        h2o.beta_features = False #ensure this is false! 
        if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets',
                                       path             = csvPathname,
                                       schema           = 'local',
                                       hex_key          = hex_key,
                                       header           = 1,
                                       header_from_file = headerKey,
                                       separator        = 44,
                                       timeoutSecs      = 16000,
                                       retryDelaySecs   = 5,
                                       pollTimeoutSecs  = 16000,
                                       noPoll           = True,
                                       doSummary        = False
                                      )
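        # noPoll=True launches the parse asynchronously; pollWaitJobs below blocks until the job queue drains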
        h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=16000, retryDelaySecs=5)
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        h2o.beta_features = False #make sure false for the inspect as well!
        inspect_train  = h2o.nodes[0].inspect(hex_key, timeoutSecs=16000)
        inspect_test   = h2o.nodes[0].inspect(testFilehex, timeoutSecs=16000)
        h2o.beta_features = True #ok, can be true again
        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row.update( {'h2o_build'          : build,
                     'nMachines'          : nMachines,
                     'nJVMs'              : len(h2o.nodes),
                     'Xmx/JVM'            : java_heap_GB,
                     'dataset'            : f,
                     'nTrainRows'         : inspect_train['num_rows'],
                     'nTestRows'          : inspect_test['num_rows'],
                     'nCols'              : inspect_train['num_cols'],
                     'trainParseWallTime' : parseWallTime,
                     'nTrees'             : ntrees,
                     'minRows'            : minrows,
                     'maxDepth'           : depth,
                     'learnRate'          : learnRate,
                     'classification'     : classification,
                    })
    
        params   =  {'destination_key'      : 'GBM('+f+')',
                     'response'             : response,
                     'ignored_cols_by_name' : ignored_cols,
                     'classification'       : classification,
                     'validation'           : testFilehex,
                     'ntrees'               : ntrees,
                     'max_depth'            : depth,
                     'min_rows'             : minrows,
                     'nbins'                : nbins,
                     'learn_rate'           : learnRate,
                    }
    
        parseResult = {'destination_key' : hex_key}
        kwargs    = params.copy()
        gbmStart  = time.time()
        #TODO(spencer): Use jobs to poll for gbm completion
        gbm       = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=120, retryDelaySecs=5)
        gbmTime   = time.time() - gbmStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        row.update( {'gbmBuildTime'       : gbmTime,
                    })
        gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
        if classification:
            cm = gbmTrainView['gbm_model']['cm']
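            # binomial misclassification rate: off-diagonal counts over the 2x2 confusion-matrix total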
            err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
        else:
            err = gbmTrainView['gbm_model']['errs'][-1]
        row.update({'Error' : err})
        csvWrt.writerow(row)
    finally:
        output.close()
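The create-or-append benchmark CSV bookkeeping at the top of doGBM (and repeated in the other do* functions below) can be factored into one place. A sketch only: open_bench_csv is a hypothetical name, and csv_header is the module-level field list these functions already assume, with os and csv imported as in these scripts:

def open_bench_csv(path, csv_header):
    # hypothetical helper: write the header row on first use, then always append
    is_new = not os.path.exists(path)
    output = open(path, 'a')
    if is_new:
        output.write(','.join(csv_header) + '\n')
    writer = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                            dialect='excel', extrasaction='ignore', delimiter=',')
    return output, writer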
Example 6
def doGLM2(f, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex,
           row, case_mode, case_val):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM2 DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre = "DEBUG"
    glm2benchcsv = 'benchmarks/' + build + '/' + pre + 'glm2bench.csv'
    if not os.path.exists(glm2benchcsv):
        output = open(glm2benchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(glm2benchcsv, 'a')
    csvWrt = csv.DictWriter(output,
                            fieldnames=csv_header,
                            restval=None,
                            dialect='excel',
                            extrasaction='ignore',
                            delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in [
                'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x',
                'AllBedroomsTrain100x'
        ]):
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            #print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..."
            #continue
            csvPathname = importFolderPath + "/" + f + "/*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        if f in (['AirlinesTrain10x', 'AirlinesTrain100x']):
            h2o.beta_features = False  #regex parsing acting weird when not using browser, use VA -> FVEC converter
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=5,
                                       pollTimeoutSecs=7200,
                                       noPoll=True,
                                       doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=7200,
                              pollTimeoutSecs=7200,
                              retryDelaySecs=5)
        parseResult = {'destination_key': hex_key}
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        h2o.beta_features = True
        inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=7200)
        inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)
        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row.update({
            'h2o_build': build,
            'nMachines': nMachines,
            'Xmx/JVM': java_heap_GB,
            'nJVMs': len(h2o.nodes),
            'dataset': f,
            'nTrainRows': inspect_train['numRows'],
            'nTestRows': inspect_test['numRows'],
            'nCols': inspect_train['numCols'],
            'trainParseWallTime': parseWallTime,
            'nfolds': nfolds,
            'family': family,
        })

        params = {
            'vresponse': y,
            'ignored_cols': x,
            'family': family,
            'lambda': lambda_,
            'alpha': alpha,
            'n_folds': nfolds,
            #'case_mode'          : case_mode,
            #'case_val'           : case_val,
            'destination_key': "GLM(" + f + ")",
        }
        h2o.beta_features = True
        kwargs = params.copy()
        glmStart = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=1800,
                             noPoll=True,
                             **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=7200,
                              pollTimeoutSecs=7200,
                              retryDelaySecs=5)
        glmTime = time.time() - glmStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        #glm       = h2o.nodes[0].inspect("GLM("+f+")")
        row.update({
            'glm2BuildTime': glmTime,
            #'AverageErrorOver10Folds'    : glm['glm_model']['validations'][0]['err'],
        })
        #if "Bedrooms" in f:
        #print "Sleeping 30"
        #time.sleep(30)
        glmView = h2o_cmd.runGLMView(modelKey="GLM(" + f + ")",
                                     timeoutSecs=380)

        #glmScoreStart = time.time()
        #glmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
        #scoreTime     = time.time() - glmScoreStart
        row.update({
            'AIC': glmView['glm_model']['validation']['aic'],
            'nIterations': glmView['glm_model']['iteration'],
            'nPredictors': len(glmView['glm_model']['beta']),
            #'AverageError' : glmView['glm_model']['validation']['avg_err'],
        })
        if family == "binomial":
            #Scrape html of 2/glmmodelview to get best threshold,
            #then, multiply by 100 and cast to int...
            #then ask for the corresponding CM from _cms inside glmView
            url = 'http://%s:%d/2/GLMModelView.html?_modelKey=%s' % (
                h2o.nodes[0].http_addr, 55555, 'GLM(' + f + ')')
            r = requests.get(url).text
            p1 = re.compile('threshold[:<>/a-z]*[0-9]\.[0-9]*')
            p2 = re.compile('[0-9]\.[0-9]*')
            best = int(float(p2.search(p1.search(r).group()).group()) * 100)
            best_cm = glmView['glm_model']['validation']['_cms'][best]['_arr']
            avg_err = 1.0 * (best_cm[0][1] + best_cm[1][0] + 0.0) / (sum(
                [i for sublist in best_cm for i in sublist]))
            row.update( {#'scoreTime'          : scoreTime,
                         'AUC'                : glmView['glm_model']['validation']['auc'],
                         'AverageError'       : avg_err,
                        })
        else:
            row.update( {#'scoreTime'          : scoreTime,
                         'AUC'                : 'NA',
                         'AverageError'       : glmView['glm_model']['validation']['avg_err'],
                        })
        csvWrt.writerow(row)
    finally:
        output.close()
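The binomial error computed above is just the off-diagonal mass of the selected 2x2 confusion matrix; pulled out as a standalone sketch (cm_error is a hypothetical name):

def cm_error(cm):
    # fraction of misclassified rows: off-diagonal counts over the total count of a 2x2 confusion matrix
    wrong = cm[0][1] + cm[1][0]
    total = sum(i for sublist in cm for i in sublist)
    return 1.0 * wrong / total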
Example 7
        finally:
            output.close()


if __name__ == '__main__':
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)
    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=False)

    #AIRLINES
    airlinesTestParseStart = time.time()
    hK = "AirlinesHeader.csv"
    headerPathname = "bench/Airlines" + "/" + hK
    h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
    headerKey = h2i.find_key(hK)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets',
                                path='bench/Airlines/AirlinesTest.csv',
                                schema='local',
                                hex_key="atest.hex",
                                header=1,
                                header_from_file=headerKey,
                                separator=44,
                                timeoutSecs=4800,
                                retryDelaySecs=5,
                                pollTimeoutSecs=4800)
    elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
    row = {'testParseWallTime': elapsedAirlinesTestParse}
    response = 'IsDepDelayed'
    ignored = None
    doGBM(files['Airlines'],
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500): # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1,64) # random length headers
                headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
                # keep trying until we get a header name we haven't used already; names have to be unique
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
            ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1
            
            DATA_HAS_HDR_ROW = random.randint(0,1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just a comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0
            
            GZIP_DATA = random.randint(0,1)
            GZIP_HEADER = random.randint(0,1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # don't put a header in a data file with a different separator?
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # Hack: if both data and header files have a header, then, just in case
            # the header and data files should have the same separator
            # if they don't, make header match data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','


            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k,v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab; it will auto-detect space and comma
            if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)
            
            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0,1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0,1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "


            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER 

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what 
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            
            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData   = SEP_CHAR_GEN.join(hfdList)

        
            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, 
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname


            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, 
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER: 
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created 
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] =  1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] =  1
            else:
                kwargs['header'] =  0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*'
            else:
                pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            print "parseResult['destination_key']: " + parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header, 
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch between gen and params?
            h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1
                
            if 1==0: # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
Example 9
    def test_w2v_basic_2(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 100
        tryList = [
            # (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)
            hex_key = "not_used.hex"

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                checkHeader=1, delete_on_done = 0, timeoutSecs=180, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspectResult = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

            src_key = h2i.find_key('syn_.*csv')

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key, # Frame False []
                    'ignored_columns': None, # string[] None []
                    'score_each_iteration': None, # boolean false []

                    'minWordFreq': 1, # int 5 []
                    'wordModel': 'CBOW', # enum [u'CBOW', u'SkipGram']
                    'normModel': 'NegSampling', # enum # [u'HSM', u'NegSampling']
                    'negSampleCnt': 1,# int 5 []
                    'vecSize': 10,  # int 100
                    'windowSize': 2,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1, # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(
                    algo='word2vec', 
                    destination_key=model_key,
                    training_frame=parse_key,
                    parameters=parameters, 
                    timeoutSecs=10) 
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics( model=model_key, frame=parse_key, timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
        
                h2o_cmd.runStoreView()
Example 10
def doKMeans(f, folderPath): 
    debug = False
    bench = "bench"
    if debug:
        print "Debugging KMEANS"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre    = "DEBUG"
    kmeansbenchcsv   = 'benchmarks/'+build+'/'+pre+'kmeansbench.csv'
    if not os.path.exists(kmeansbenchcsv):
        output       = open(kmeansbenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output       = open(kmeansbenchcsv,'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB     = h2o.nodes[0].java_heap_GB
        #Train File Parsing#
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key         = f + '.hex'
        hK              = folderPath + "Header.csv"
        headerPathname  = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey       = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult     = h2i.import_parse(bucket           = 'home-0xdiag-datasets', 
                                           path             = csvPathname, 
                                           schema           = 'local', 
                                           hex_key          = hex_key, 
                                           header           = 1, 
                                           header_from_file = headerKey, 
                                           separator        = 44,
                                           timeoutSecs      = 7200,
                                           retryDelaySecs   = 5,
                                           pollTimeoutSecs  = 7200,
                                           doSummary        = False
                                          )
        parseWallTime   = time.time() - trainParseWallStart
        #End Train File Parse#
        print "Parsing training file took ", parseWallTime ," seconds." 
        
        inspect         = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        
        nMachines       = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row             =  {'h2o_build'          : build,
                            'nMachines'          : nMachines,
                            'nJVMs'              : len(h2o.nodes),
                            'Xmx/JVM'            : java_heap_GB,
                            'dataset'            : f,
                            'nRows'              : inspect['num_rows'],
                            'nCols'              : inspect['num_cols'],
                            'parseWallTime'      : parseWallTime,
                            'k'                  : 6, 
                            'max_iter'           : 100,
                            'init'               : 'Furthest',
                           }
    
        params          =  {'source_key'         : hex_key,
                            'k'                  : 6,
                            'initialization'     : 'Furthest',
                            'max_iter'           : 100,
                            'seed'               : 1234567,
                            'normalize'          : 0,
                            #'cols'               : ,
                            'destination_key'    : "KMeans("+f+")",
                           }
        kwargs          = params.copy()
        kmeansStart     = time.time()
        kmeans          = h2o_cmd.runKMeans(parseResult=parseResult, 
                                            timeoutSecs=7200,
                                             **kwargs)
        kmeansTime      = time.time() - kmeansStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        row.update({'kmeansBuildTime' : kmeansTime})
        csvWrt.writerow(row)
    finally:
        output.close()
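Every benchmark function here wraps exactly one blocking call in the same start/stop wall-clock pattern; a tiny sketch of that pattern (timed is a hypothetical helper, assuming time is imported as in these scripts):

def timed(fn, *args, **kwargs):
    # hypothetical helper: return the call's result plus its wall-clock duration,
    # mirroring the kmeansStart/kmeansTime bookkeeping above
    start = time.time()
    result = fn(*args, **kwargs)
    return result, time.time() - start

# e.g.: kmeans, kmeansTime = timed(h2o_cmd.runKMeans, parseResult=parseResult, timeoutSecs=7200, **kwargs)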
Example 11
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug: pre    = 'DEBUG'
        gbmbenchcsv = 'benchmarks/'+build+'/'+date+'/'+pre+'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv,'w')
            output.write(','.join(csv_header)+'\n')
        else:
            output = open(gbmbenchcsv,'a')
        csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                        dialect='excel', extrasaction='ignore',delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + folderPath
            if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): 
                csvPathname = importFolderPath + "/" + f + '.csv'
            else: 
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"    
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets',
                                           path             = csvPathname,
                                           schema           = 'local',
                                           hex_key          = hex_key,
                                           header           = 1,
                                           header_from_file = headerKey,
                                           separator        = 44,
                                           timeoutSecs      = 7200,
                                           retryDelaySecs   = 5,
                                           pollTimeoutSecs  = 7200
                                          )             

            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime ," seconds." 
        
            inspect_train  = h2o.nodes[0].inspect(parseResult['destination_key'])
            inspect_test   = h2o.nodes[0].inspect(testFilehex)
            
            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
            row.update( {'h2o_build'          : build,
                         'nMachines'          : nMachines,
                         'nJVMs'              : len(h2o.nodes),
                         'Xmx/JVM'            : java_heap_GB,
                         'dataset'            : f,
                         'nTrainRows'         : inspect_train['numRows'],
                         'nTestRows'          : inspect_test['numRows'],
                         'nCols'              : inspect_train['numCols'],
                         'trainParseWallTime' : parseWallTime,
                         'classification'     : classification,
                        })
        
            params   =  {'destination_key'      : 'GBM('+f+')',
                         'response'             : response,
                         'ignored_cols_by_name' : ignored_cols,
                         'classification'       : classification,
                         'validation'           : testFilehex,
                         'ntrees'               : ntrees,
                         'max_depth'            : depth,
                         'min_rows'             : minrows,
                         'nbins'                : nbins,
                         'learn_rate'           : learnRate,
                        }

            kwargs    = params.copy()
            gbmStart  = time.time()
            #TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm       = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime   = time.time() - gbmStart
            row.update( {'gbmBuildTime'       : gbmTime,
                        })
            #TODO(spencer): Add in gbm scoring
            #gbmScoreStart = time.time()
            #gbmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
            #scoreTime     = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
Ejemplo n.º 12
0
def doGLM2(f, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, row, case_mode, case_val):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM2 DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart  = time.time()
    pre               = ""
    if debug: pre     = "DEBUG"
    glm2benchcsv      = 'benchmarks/'+build+'/'+date+'/'+pre+'glm2bench.csv'
    if not os.path.exists(glm2benchcsv):
        output = open(glm2benchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(glm2benchcsv,'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB     = h2o.nodes[0].java_heap_GB
        importFolderPath = bench+"/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            #print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..." 
            #continue
            csvPathname = importFolderPath + "/" + f + "/*"
        hex_key         = f + '.hex'
        hK              = folderPath + "Header.csv"    
        headerPathname  = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey       = h2i.find_key(hK)
        trainParseWallStart = time.time()
        if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets',
                                       path             = csvPathname,
                                       schema           = 'local',
                                       hex_key          = hex_key,
                                       header           = 1,
                                       header_from_file = headerKey,
                                       separator        = 44,
                                       timeoutSecs      = 7200,
                                       retryDelaySecs   = 5,
                                       pollTimeoutSecs  = 7200,
                                       noPoll           = True,
                                       doSummary        = False
                                      )
        h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
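        # import_parse above was started with noPoll=True, so after the jobs finish
        # rebuild a minimal parseResult dict that points at the expected parse key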
        parseResult = {'destination_key':hex_key}
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        h2o.beta_features = True
        inspect_train  = h2o.nodes[0].inspect(hex_key, timeoutSecs=7200)
        inspect_test   = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)
        
        row.update( {'h2o_build'          : build,  
                     'java_heap_GB'       : java_heap_GB,
                     'dataset'            : f,
                     'nTrainRows'         : inspect_train['numRows'],
                     'nTestRows'          : inspect_test['numRows'],
                     'nCols'              : inspect_train['numCols'],
                     'trainParseWallTime' : parseWallTime,
                     'nfolds'             : nfolds,
                    })
    
        params   =  {'vresponse'       : y,
                     'ignored_cols'    : x,
                     'family'          : family,
                     'lambda'          : lambda_,
                     'alpha'           : alpha,
                     'n_folds'         : nfolds,
                     #'case_mode'          : case_mode,
                     #'case_val'           : case_val, 
                     'destination_key' : "GLM("+f+")",
                    }
        h2o.beta_features = True
        kwargs    = params.copy()
        glmStart  = time.time()
        glm       = h2o_cmd.runGLM(parseResult = parseResult, timeoutSecs=1800, noPoll=True, **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
        glmTime   = time.time() - glmStart
        #glm       = h2o.nodes[0].inspect("GLM("+f+")")
        row.update( {'glmBuildTime'       : glmTime,
                     #'AverageErrorOver10Folds'    : glm['glm_model']['validations'][0]['err'],
                    })
        #if "Bedrooms" in f: 
            #print "Sleeping 30"
            #time.sleep(30)
        glmView = h2o_cmd.runGLMView(modelKey = "GLM("+f+")", timeoutSecs=380)

        #glmScoreStart = time.time()
        #glmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
        #scoreTime     = time.time() - glmScoreStart
        row.update( {'AIC'          : glmView['glm_model']['validation']['aic'],
                     'nIterations'  : glmView['glm_model']['iteration'],
                     'nPredictors'  : len(glmView['glm_model']['beta']),
                     #'AverageError' : glmView['glm_model']['validation']['avg_err'],
                    })
        if family == "binomial":
            #Scrape html of 2/glmmodelview to get best threshold,
            #then, multiply by 100 and cast to int...
            #then ask for the corresponding CM from _cms inside glmView
            url     = 'http://%s:%d/2/GLMModelView.html?_modelKey=%s' % (h2o.nodes[0].http_addr, 55555, 'GLM('+f+')')
            r       = requests.get(url).text
            p1      = re.compile('threshold[:<>/a-z]*[0-9]\.[0-9]*')
            p2      = re.compile('[0-9]\.[0-9]*')
            best    = int(float(p2.search(p1.search(r).group()).group()) * 100)
            best_cm = glmView['glm_model']['validation']['_cms'][best]['_arr']
            avg_err = float(best_cm[0][1] + best_cm[1][0]) / sum([i for sublist in best_cm for i in sublist])
            row.update( {#'scoreTime'          : scoreTime,
                         'AUC'                : glmView['glm_model']['validation']['auc'],
                         'AverageError'       : avg_err,
                        })
        else:
            row.update( {#'scoreTime'          : scoreTime,
                         'AUC'                : 'NA',
                         'AverageError'       : glmView['glm_model']['validation']['avg_err'],
                        })
        csvWrt.writerow(row)
    finally:
        output.close()
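
A minimal sketch of the threshold scrape used in the binomial branch above. The regexes are the same ones doGLM2 uses, but sample_html, the numbers, and best_cm below are purely illustrative stand-ins for what 2/GLMModelView.html might return:

import re

sample_html = "best threshold</td><td>0.375</td>"   # hypothetical page fragment
p1 = re.compile('threshold[:<>/a-z]*[0-9]\.[0-9]*')
p2 = re.compile('[0-9]\.[0-9]*')
best = int(float(p2.search(p1.search(sample_html).group()).group()) * 100)   # -> 37
# 'best' indexes the model's _cms list; the off-diagonal counts of that confusion
# matrix divided by the total count give the average error:
best_cm = [[50, 10], [5, 35]]                        # illustrative 2x2 counts
avg_err = float(best_cm[0][1] + best_cm[1][0]) / sum(i for sublist in best_cm for i in sublist)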
Ejemplo n.º 13
0
def doKMeans(f, folderPath):
    debug = False
    bench = "bench"
    if debug:
        print "Debugging KMEANS"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre = "DEBUG"
    kmeansbenchcsv = 'benchmarks/' + build + '/' + pre + 'kmeansbench.csv'
    if not os.path.exists(kmeansbenchcsv):
        output = open(kmeansbenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(kmeansbenchcsv, 'a')
    csvWrt = csv.DictWriter(output,
                            fieldnames=csv_header,
                            restval=None,
                            dialect='excel',
                            extrasaction='ignore',
                            delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        #Train File Parsing#
        importFolderPath = bench + "/" + folderPath
        if (f in [
                'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x',
                'AllBedroomsTrain100x'
        ]):
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=5,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)
        parseWallTime = time.time() - trainParseWallStart
        #End Train File Parse#
        print "Parsing training file took ", parseWallTime, " seconds."

        inspect = h2o.nodes[0].inspect(parseResult['destination_key'],
                                       timeoutSecs=7200)

        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row = {
            'h2o_build': build,
            'nMachines': nMachines,
            'nJVMs': len(h2o.nodes),
            'Xmx/JVM': java_heap_GB,
            'dataset': f,
            'nRows': inspect['num_rows'],
            'nCols': inspect['num_cols'],
            'parseWallTime': parseWallTime,
            'k': 6,
            'max_iter': 100,
            'init': 'Furthest',
        }

        params = {
            'source_key': hex_key,
            'k': 6,
            'initialization': 'Furthest',
            'max_iter': 100,
            'seed': 1234567,
            'normalize': 0,
            #'cols'               : ,
            'destination_key': "KMeans(" + f + ")",
        }
        kwargs = params.copy()
        kmeansStart = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                   timeoutSecs=7200,
                                   **kwargs)
        kmeansTime = time.time() - kmeansStart
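        # stop the external resource loggers started earlier by the harness;
        # `json` is assumed to be a module-level string naming this run's config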
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        row.update({'kmeansBuildTime': kmeansTime})
        csvWrt.writerow(row)
    finally:
        output.close()
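
A hypothetical driver sketch for the helper above; it assumes the harness has already started the h2o cloud and defined the module-level build, csv_header, and json values that doKMeans reads, and the dataset names mirror the branches checked inside the function:

for dataset in ['AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x']:
    doKMeans(dataset, 'Airlines')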
Ejemplo n.º 14
0
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x,
          testFilehex, row):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre = "DEBUG"
    glmbenchcsv = 'benchmarks/' + build + '/' + pre + 'glmbench.csv'
    if not os.path.exists(glmbenchcsv):
        output = open(glmbenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(glmbenchcsv, 'a')
    csvWrt = csv.DictWriter(output,
                            fieldnames=csv_header,
                            restval=None,
                            dialect='excel',
                            extrasaction='ignore',
                            delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in [
                'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x',
                'AllBedroomsTrain100x'
        ]):
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=5,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)

        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'],
                                             timeoutSecs=7200)
        inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)

        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row.update({
            'h2o_build': build,
            'nMachines': nMachines,
            'nJVMs': len(h2o.nodes),
            'Xmx/JVM': java_heap_GB,
            'dataset': f,
            'nTrainRows': inspect_train['num_rows'],
            'nTestRows': inspect_test['num_rows'],
            'nCols': inspect_train['num_cols'],
            'trainParseWallTime': parseWallTime,
            'nfolds': nfolds,
            'family': family,
        })

        params = {
            'y': y,
            'x': x,
            'family': family,
            'link': link,
            'lambda': lambda_,
            'alpha': alpha,
            'n_folds': nfolds,
            'case_mode': "n/a",
            'destination_key': "GLM(" + f + ")",
            'expert_settings': 0,
        }

        kwargs = params.copy()
        glmStart = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=7200,
                             **kwargs)
        glmTime = time.time() - glmStart
        row.update({
            'glmBuildTime': glmTime,
            #'AverageErrorOver10Folds'    : glm['GLMModel']['validations'][0]['err'],
        })

        glmScoreStart = time.time()
        glmScore = h2o_cmd.runGLMScore(key=testFilehex,
                                       model_key=params['destination_key'],
                                       timeoutSecs=1800)
        scoreTime = time.time() - glmScoreStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        if family == "binomial":
            row.update({
                'scoreTime': scoreTime,
                'AUC': glmScore['validation']['auc'],
                'AIC': glmScore['validation']['aic'],
                'error': glmScore['validation']['err'],
            })
        else:
            row.update({
                'scoreTime': scoreTime,
                'AIC': glmScore['validation']['aic'],
                'AUC': 'NA',
                'error': glmScore['validation']['err'],
            })
        csvWrt.writerow(row)
    finally:
        output.close()
Ejemplo n.º 15
0
    def test_parse_multi_header_single(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
        ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 rowCount,
                                                 headerData=None,
                                                 rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 dataRowsWithHeader,
                                                 headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)
                print f

            # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
            # frames = h2o.nodes[0].frames()['frames']
            frames = h2o.n0.frames()['frames']
            frames_dict = h2o_util.list_to_dict(frames, 'key/name')

            # print "frames:", dump_json(frames)
            # print "frames_dict:", dump_json(frames_dict)

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception(
                        "Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()

            # does h2o-dev take a regex? or do we need to glob
            parseResult = h2i.parse_only(
                pattern='*' + rowxcol + '*',
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                checkHeader="1")  # header_from_file=header

            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=totalDataRows,
                                  expectedNumCols=totalCols)
            print pA.numRows
            print pA.numCols
            print pA.parse_key

            expectedLabelList = headerData.split(",")
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=totalDataRows,
                                    expectedNumCols=totalCols,
                                    expectedMissinglist=[],
                                    expectedLabelList=expectedLabelList)

            if DO_RF:
                # put in an ignore param, that will fail unless headers were parsed correctly
                if HEADER:
                    kwargs = {
                        'sample_rate': 0.75,
                        'max_depth': 25,
                        'ntrees': 1,
                        'ignored_cols_by_name': 'ID,CAPSULE'
                    }
                else:
                    kwargs = {
                        'sample_rate': 0.75,
                        'max_depth': 25,
                        'ntrees': 1
                    }

                rfv = h2o_cmd.runRF(parseResult=parseResult,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)

            h2o.check_sandbox_for_errors()
Ejemplo n.º 16
0
def doPCA(f, folderPath):
    debug = False
    bench = "bench"
    if debug:
        print "Doing PCA DEBUG"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    retryDelaySecs = 5 #if f == 'AirlinesTrain1x' else 30
    overallWallStart = time.time()
    pre = ""
    if debug: pre    = 'DEBUG'
    pcabenchcsv      = 'benchmarks/'+build+'/'+pre+'pcabench.csv'
    if not os.path.exists(pcabenchcsv):
        output = open(pcabenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(pcabenchcsv,'a')
    csvWrt     = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB     = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        
        hex_key             = f + '.hex'
        trainParseWallStart = time.time()
        hK                  = folderPath + "Header.csv"
        headerPathname      = importFolderPath + "/" + hK
        
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey           = h2i.find_key(hK)
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets', 
                                       path             = csvPathname, 
                                       schema           = 'local', 
                                       hex_key          = hex_key,
                                       header           = 1, 
                                       header_from_file = headerKey, 
                                       separator        = 44,
                                       timeoutSecs      = 7200, 
                                       retryDelaySecs   = retryDelaySecs,
                                       pollTimeoutSecs  = 7200,
                                       doSummary        = False
                                      )
        parseWallTime       = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        inspect             = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        
        nMachines           = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row                 =  {'h2o_build'          : build, 
                                'nMachines'          : nMachines,
                                'nJVMs'              : len(h2o.nodes),
                                'Xmx/JVM'            : java_heap_GB,
                                'dataset'            : f,
                                'nRows'              : inspect['num_rows'],
                                'nCols'              : inspect['num_cols'],
                                'parseWallTime'      : parseWallTime,
                               }
    
        params              =  {'destination_key'    : "python_PCA_key",
                                'tolerance'          : 0.0,
                                'standardize'        : 1,
                               }

        kwargs              = params.copy()
        pcaStart            = time.time()
        #h2o.beta_features   = True
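        # start PCA asynchronously (noPoll=True); h2j.pollWaitJobs below blocks until
        # the job completes, so pcaTime measures the full build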
        pcaResult = h2o_cmd.runPCA(parseResult = parseResult, noPoll = True,
                                   timeoutSecs = 7200, 
                                   **kwargs)

        h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2)
        pcaTime   = time.time() - pcaStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        #stop all loggers
        os.system(cmd)
        row.update({'pcaBuildTime' : pcaTime})
        csvWrt.writerow(row)
    finally:
        output.close()
Ejemplo n.º 17
0
    def test_parse_multi_header_single(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
            ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
                print f

            # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
            # frames = h2o.nodes[0].frames()['frames']
            frames = h2o.n0.frames()['frames']
            frames_dict = h2o_util.list_to_dict(frames, 'key/name')

            # print "frames:", dump_json(frames)
            # print "frames_dict:", dump_json(frames_dict)

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception("Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()

            # does h2o-dev take a regex? or do we need to glob
            parseResult = h2i.parse_only(pattern='*'+rowxcol+'*',
                hex_key=hex_key, timeoutSecs=timeoutSecs, checkHeader="1") # header_from_file=header

            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
            print pA.numRows
            print pA.numCols
            print pA.parse_key

            expectedLabelList = headerData.split(",")
            iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols,
                expectedMissinglist=[], expectedLabelList=expectedLabelList)

            if DO_RF:
                # put in an ignore param, that will fail unless headers were parsed correctly
                if HEADER:
                    kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE'}
                else:
                    kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

                rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            h2o.check_sandbox_for_errors()
    def test_parse_multi_header_single_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
            ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
                print f

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception("Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()
            parseResult = h2i.parse_only(pattern='*'+rowxcol+'*',
                hex_key=hex_key, timeoutSecs=timeoutSecs, header="1", header_from_file=header)

            print "parseResult['destination_key']: " + parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # numCols should match the header's column count (totalCols)
            self.assertEqual(inspect['numCols'], totalCols, 
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))
            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE'}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

            start = time.time()
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
Ejemplo n.º 19
0
def doSUM(f, folderPath):
    debug = False
    bench = "bench"
    if debug:
        print "Doing SUM DEBUG"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    retryDelaySecs = 5  #if f == 'AirlinesTrain1x' else 30
    overallWallStart = time.time()
    pre = ""
    if debug: pre = 'DEBUG'
    sumbenchcsv = 'benchmarks/' + build + '/' + pre + 'summarybench.csv'
    if not os.path.exists(sumbenchcsv):
        output = open(sumbenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(sumbenchcsv, 'a')
    csvWrt = csv.DictWriter(output,
                            fieldnames=csv_header,
                            restval=None,
                            dialect='excel',
                            extrasaction='ignore',
                            delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in [
                'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x',
                'AllBedroomsTrain100x'
        ]):
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"

        hex_key = f + '.hex'
        trainParseWallStart = time.time()
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK

        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=retryDelaySecs,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        inspect = h2o.nodes[0].inspect(parseResult['destination_key'],
                                       timeoutSecs=7200)

        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row = {
            'h2o_build': build,
            'nMachines': nMachines,
            'nJVMs': len(h2o.nodes),
            'Xmx/JVM': java_heap_GB,
            'dataset': f,
            'nRows': inspect['num_rows'],
            'nCols': inspect['num_cols'],
            'parseWallTime': parseWallTime,
        }

        sumStart = time.time()
        sumResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=7200)

        sumTime = time.time() - sumStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        row.update({'summaryBuildTime': sumTime})
        csvWrt.writerow(row)
    finally:
        output.close()
    def test_parse_multi_header_single_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
        ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 rowCount,
                                                 headerData=None,
                                                 rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 dataRowsWithHeader,
                                                 headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)
                print f

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception(
                        "Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()
            parseResult = h2i.parse_only(pattern='*' + rowxcol + '*',
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         header="1",
                                         header_from_file=header)

            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # numCols should match the header's column count (totalCols)
            self.assertEqual(
                inspect['numCols'], totalCols,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], totalCols))
            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {
                    'sample_rate': 0.75,
                    'max_depth': 25,
                    'ntrees': 1,
                    'ignored_cols_by_name': 'ID,CAPSULE'
                }
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

            start = time.time()
            rfv = h2o_cmd.runRF(parseResult=parseResult,
                                timeoutSecs=timeoutSecs,
                                **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
Ejemplo n.º 21
0
        bench = "bench/debug"

    if dat == 'Air1x'    : fs = files['Airlines']['train'][0]
    if dat == 'Air10x'   : fs = files['Airlines']['train'][1]
    if dat == 'Air100x'  : fs = files['Airlines']['train'][2]
    if dat == 'AllB1x'   : fs = files['AllBedrooms']['train'][0]
    if dat == 'AllB10x'  : fs = files['AllBedrooms']['train'][1]
    if dat == 'AllB100x' : fs = files['AllBedrooms']['train'][2]

    if fp == "Airlines":
        #AIRLINES
        airlinesTestParseStart      = time.time()
        hK                          =  "AirlinesHeader.csv"
        headerPathname              = bench+"/Airlines" + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey                   = h2i.find_key(hK)
        testFile                    = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/Airlines/AirlinesTest.csv', schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44, noPoll=True,doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=16000, retryDelaySecs=5)
        elapsedAirlinesTestParse    = time.time() - airlinesTestParseStart
        row = {'testParseWallTime' : elapsedAirlinesTestParse}
        response = 'IsDepDelayed'
        ignored  = None
        doGBM(fs, fp,
                ignored_cols    = ignored, 
                classification  = 1,
                testFilehex     = 'atest.hex',
                ntrees          = 100,
                depth           = 5,
                minrows         = 10,
                nbins           = 100,
                learnRate       = 0.01,
                response        = response,
                row             = row)
Ejemplo n.º 22
0
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees,
          depth, minrows, nbins, learnRate, response, row):
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug: pre = 'DEBUG'
        gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv, 'w')
            output.write(','.join(csv_header) + '\n')
        else:
            output = open(gbmbenchcsv, 'a')
        csvWrt = csv.DictWriter(output,
                                fieldnames=csv_header,
                                restval=None,
                                dialect='excel',
                                extrasaction='ignore',
                                delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + "/" + folderPath
            if (f in [
                    'AirlinesTrain1x', 'AllBedroomsTrain1x',
                    'AllBedroomsTrain10x', 'AllBedroomsTrain100x',
                    'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x'
            ]):
                csvPathname = importFolderPath + "/" + f + '.csv'
            else:
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           header=1,
                                           header_from_file=headerKey,
                                           separator=44,
                                           timeoutSecs=7200,
                                           retryDelaySecs=5,
                                           pollTimeoutSecs=7200)

            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime, " seconds."

            inspect_train = h2o.nodes[0].inspect(
                parseResult['destination_key'])
            inspect_test = h2o.nodes[0].inspect(testFilehex)

            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(
                h2o_hosts.hosts)
            row.update({
                'h2o_build': build,
                'nMachines': nMachines,
                'nJVMs': len(h2o.nodes),
                'Xmx/JVM': java_heap_GB,
                'dataset': f,
                'nTrainRows': inspect_train['numRows'],
                'nTestRows': inspect_test['numRows'],
                'nCols': inspect_train['numCols'],
                'trainParseWallTime': parseWallTime,
                'classification': classification,
            })

            params = {
                'destination_key': 'GBM(' + f + ')',
                'response': response,
                'ignored_cols_by_name': ignored_cols,
                'classification': classification,
                'validation': testFilehex,
                'ntrees': ntrees,
                'max_depth': depth,
                'min_rows': minrows,
                'nbins': nbins,
                'learn_rate': learnRate,
            }

            kwargs = params.copy()
            gbmStart = time.time()
            #TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm = h2o_cmd.runGBM(parseResult=parseResult,
                                 noPoll=True,
                                 timeoutSecs=4800,
                                 **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200,
                                  pollTimeoutSecs=120,
                                  retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime = time.time() - gbmStart
            row.update({
                'gbmBuildTime': gbmTime,
            })
            #TODO(spencer): Add in gbm scoring
            #gbmScoreStart = time.time()
            #gbmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
            #scoreTime     = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
Ejemplo n.º 23
0
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre              = ""
    if debug: pre    = "DEBUG"
    glmbenchcsv      = 'benchmarks/'+build+'/'+pre+'glmbench.csv'
    if not os.path.exists(glmbenchcsv):
        output = open(glmbenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(glmbenchcsv,'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB     = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key         = f + '.hex'
        hK              = folderPath + "Header.csv"    
        headerPathname  = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey       = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets', 
                                       path             = csvPathname, 
                                       schema           = 'local', 
                                       hex_key          = hex_key,  
                                       header           = 1, 
                                       header_from_file = headerKey, 
                                       separator        = 44,
                                       timeoutSecs      = 7200,
                                       retryDelaySecs   = 5,
                                       pollTimeoutSecs  = 7200,
                                       doSummary        = False
                                      )

        parseWallTime  = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        inspect_train  = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        inspect_test   = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)

        nMachines      = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row.update( {'h2o_build'          : build,
                     'nMachines'          : nMachines,
                     'nJVMs'              : len(h2o.nodes),
                     'Xmx/JVM'            : java_heap_GB,
                     'dataset'            : f,
                     'nTrainRows'         : inspect_train['num_rows'],
                     'nTestRows'          : inspect_test['num_rows'],
                     'nCols'              : inspect_train['num_cols'],
                     'trainParseWallTime' : parseWallTime,
                     'nfolds'             : nfolds,
                     'family'             : family,
                    })

        params   =  {'y'                  : y,
                     'x'                  : x,
                     'family'             : family,
                     'link'               : link,
                     'lambda'             : lambda_,
                     'alpha'              : alpha,
                     'n_folds'            : nfolds,
                     'case_mode'          : "n/a",
                     'destination_key'    : "GLM("+f+")",
                     'expert_settings'    : 0,
                    }

        kwargs    = params.copy()
        glmStart  = time.time()
        glm       = h2o_cmd.runGLM(parseResult = parseResult, 
                                   timeoutSecs = 7200, 
                                   **kwargs)
        glmTime   = time.time() - glmStart
        row.update( {'glmBuildTime'       : glmTime,
                     #'AverageErrorOver10Folds'    : glm['GLMModel']['validations'][0]['err'],
                    })
        
        glmScoreStart = time.time()
        glmScore      = h2o_cmd.runGLMScore(key         = testFilehex,
                                            model_key   = params['destination_key'],
                                            timeoutSecs = 1800)
        scoreTime     = time.time() - glmScoreStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        if family == "binomial":
            row.update( {'scoreTime'          : scoreTime,
                         'AUC'                : glmScore['validation']['auc'],
                         'AIC'                : glmScore['validation']['aic'],
                         'error'              : glmScore['validation']['err'],
                        })
        else:
            row.update( {'scoreTime'          : scoreTime,
                         'AIC'                : glmScore['validation']['aic'],
                         'AUC'                : 'NA',
                         'error'              : glmScore['validation']['err'],
                        })
        csvWrt.writerow(row)
    finally:
        output.close()
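
A hypothetical call sketch for doGLM, in the same driver style as Ejemplo n.º 21. Every argument value here is illustrative; it assumes the matching test frame was already parsed to 'atest.hex' and that the harness built the shared row dict:

doGLM('AirlinesTrain1x', 'Airlines',
      family      = 'binomial',
      link        = 'logit',
      lambda_     = 1e-4,
      alpha       = 0.5,
      nfolds      = 10,
      y           = 'IsDepDelayed',
      x           = None,
      testFilehex = 'atest.hex',
      row         = row)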